From 6aff5cfbc1adff2f64998d8d70a957fefdb4ea05 Mon Sep 17 00:00:00 2001 From: amazingfate Date: Wed, 20 Mar 2024 10:43:37 +0800 Subject: [PATCH] rockchip-rk3588: edge: add panthor support --- .../kernel/linux-rockchip-rk3588-edge.config | 2 + .../0041-panthor-v6.patch | 14855 ++++++++++++++++ ...64-dts-rockchip-rk3588-Add-GPU-nodes.patch | 150 + ...-rockchip-rk3588-rock5b-Add-GPU-node.patch | 38 + 4 files changed, 15045 insertions(+) create mode 100644 patch/kernel/rockchip-rk3588-edge/0041-panthor-v6.patch create mode 100644 patch/kernel/rockchip-rk3588-edge/0042-arm64-dts-rockchip-rk3588-Add-GPU-nodes.patch create mode 100644 patch/kernel/rockchip-rk3588-edge/0043-arm64-dts-rockchip-rk3588-rock5b-Add-GPU-node.patch diff --git a/config/kernel/linux-rockchip-rk3588-edge.config b/config/kernel/linux-rockchip-rk3588-edge.config index c9b93d621e..3639beaf8b 100644 --- a/config/kernel/linux-rockchip-rk3588-edge.config +++ b/config/kernel/linux-rockchip-rk3588-edge.config @@ -6374,6 +6374,7 @@ CONFIG_DRM_DISPLAY_HDMI_HELPER=y CONFIG_DRM_DP_CEC=y CONFIG_DRM_TTM=m CONFIG_DRM_EXEC=m +CONFIG_DRM_GPUVM=m CONFIG_DRM_BUDDY=m CONFIG_DRM_VRAM_HELPER=m CONFIG_DRM_TTM_HELPER=m @@ -6600,6 +6601,7 @@ CONFIG_DRM_XEN=y CONFIG_DRM_XEN_FRONTEND=m CONFIG_DRM_LIMA=m CONFIG_DRM_PANFROST=m +CONFIG_DRM_PANTHOR=m CONFIG_DRM_TIDSS=m CONFIG_DRM_GUD=m CONFIG_DRM_SSD130X=m diff --git a/patch/kernel/rockchip-rk3588-edge/0041-panthor-v6.patch b/patch/kernel/rockchip-rk3588-edge/0041-panthor-v6.patch new file mode 100644 index 0000000000..bffe36dcd8 --- /dev/null +++ b/patch/kernel/rockchip-rk3588-edge/0041-panthor-v6.patch @@ -0,0 +1,14855 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:15 +0100 +Subject: drm/panthor: Add uAPI + +Panthor follows the lead of other recently submitted drivers with +ioctls allowing us to support modern Vulkan features, like sparse memory +binding: + +- Pretty standard GEM management ioctls (BO_CREATE and BO_MMAP_OFFSET), + with the 'exclusive-VM' bit to speed-up BO reservation on job submission +- VM management ioctls (VM_CREATE, VM_DESTROY and VM_BIND). The VM_BIND + ioctl is loosely based on the Xe model, and can handle both + asynchronous and synchronous requests +- GPU execution context creation/destruction, tiler heap context creation + and job submission. Those ioctls reflect how the hardware/scheduler + works and are thus driver specific. + +We also have a way to expose IO regions, such that the usermode driver +can directly access specific/well-isolate registers, like the +LATEST_FLUSH register used to implement cache-flush reduction. + +This uAPI intentionally keeps usermode queues out of the scope, which +explains why doorbell registers and command stream ring-buffers are not +directly exposed to userspace. + +v6: +- Add Maxime's and Heiko's acks + +v5: +- Fix typo +- Add Liviu's R-b + +v4: +- Add a VM_GET_STATE ioctl +- Fix doc +- Expose the CORE_FEATURES register so we can deal with variants in the + UMD +- Add Steve's R-b + +v3: +- Add the concept of sync-only VM operation +- Fix support for 32-bit userspace +- Rework drm_panthor_vm_create to pass the user VA size instead of + the kernel VA size (suggested by Robin Murphy) +- Typo fixes +- Explicitly cast enums with top bit set to avoid compiler warnings in + -pedantic mode. +- Drop property core_group_count as it can be easily calculated by the + number of bits set in l2_present. + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Reviewed-by: Steven Price +Reviewed-by: Liviu Dudau +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + Documentation/gpu/driver-uapi.rst | 5 + + include/uapi/drm/panthor_drm.h | 945 ++++++++++ + 2 files changed, 950 insertions(+) + +diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst +index e5070a0e95ab..971cdb4816fc 100644 +--- a/Documentation/gpu/driver-uapi.rst ++++ b/Documentation/gpu/driver-uapi.rst +@@ -18,6 +18,11 @@ VM_BIND / EXEC uAPI + + .. kernel-doc:: include/uapi/drm/nouveau_drm.h + ++drm/panthor uAPI ++================ ++ ++.. kernel-doc:: include/uapi/drm/panthor_drm.h ++ + drm/xe uAPI + =========== + +diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h +new file mode 100644 +index 000000000000..373df80f41ed +--- /dev/null ++++ b/include/uapi/drm/panthor_drm.h +@@ -0,0 +1,945 @@ ++/* SPDX-License-Identifier: MIT */ ++/* Copyright (C) 2023 Collabora ltd. */ ++#ifndef _PANTHOR_DRM_H_ ++#define _PANTHOR_DRM_H_ ++ ++#include "drm.h" ++ ++#if defined(__cplusplus) ++extern "C" { ++#endif ++ ++/** ++ * DOC: Introduction ++ * ++ * This documentation describes the Panthor IOCTLs. ++ * ++ * Just a few generic rules about the data passed to the Panthor IOCTLs: ++ * ++ * - Structures must be aligned on 64-bit/8-byte. If the object is not ++ * naturally aligned, a padding field must be added. ++ * - Fields must be explicitly aligned to their natural type alignment with ++ * pad[0..N] fields. ++ * - All padding fields will be checked by the driver to make sure they are ++ * zeroed. ++ * - Flags can be added, but not removed/replaced. ++ * - New fields can be added to the main structures (the structures ++ * directly passed to the ioctl). Those fields can be added at the end of ++ * the structure, or replace existing padding fields. Any new field being ++ * added must preserve the behavior that existed before those fields were ++ * added when a value of zero is passed. ++ * - New fields can be added to indirect objects (objects pointed by the ++ * main structure), iff those objects are passed a size to reflect the ++ * size known by the userspace driver (see drm_panthor_obj_array::stride ++ * or drm_panthor_dev_query::size). ++ * - If the kernel driver is too old to know some fields, those will be ++ * ignored if zero, and otherwise rejected (and so will be zero on output). ++ * - If userspace is too old to know some fields, those will be zeroed ++ * (input) before the structure is parsed by the kernel driver. ++ * - Each new flag/field addition must come with a driver version update so ++ * the userspace driver doesn't have to trial and error to know which ++ * flags are supported. ++ * - Structures should not contain unions, as this would defeat the ++ * extensibility of such structures. ++ * - IOCTLs can't be removed or replaced. New IOCTL IDs should be placed ++ * at the end of the drm_panthor_ioctl_id enum. ++ */ ++ ++/** ++ * DOC: MMIO regions exposed to userspace. ++ * ++ * .. c:macro:: DRM_PANTHOR_USER_MMIO_OFFSET ++ * ++ * File offset for all MMIO regions being exposed to userspace. Don't use ++ * this value directly, use DRM_PANTHOR_USER__OFFSET values instead. ++ * pgoffset passed to mmap2() is an unsigned long, which forces us to use a ++ * different offset on 32-bit and 64-bit systems. ++ * ++ * .. c:macro:: DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET ++ * ++ * File offset for the LATEST_FLUSH_ID register. The Userspace driver controls ++ * GPU cache flushing through CS instructions, but the flush reduction ++ * mechanism requires a flush_id. This flush_id could be queried with an ++ * ioctl, but Arm provides a well-isolated register page containing only this ++ * read-only register, so let's expose this page through a static mmap offset ++ * and allow direct mapping of this MMIO region so we can avoid the ++ * user <-> kernel round-trip. ++ */ ++#define DRM_PANTHOR_USER_MMIO_OFFSET_32BIT (1ull << 43) ++#define DRM_PANTHOR_USER_MMIO_OFFSET_64BIT (1ull << 56) ++#define DRM_PANTHOR_USER_MMIO_OFFSET (sizeof(unsigned long) < 8 ? \ ++ DRM_PANTHOR_USER_MMIO_OFFSET_32BIT : \ ++ DRM_PANTHOR_USER_MMIO_OFFSET_64BIT) ++#define DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET (DRM_PANTHOR_USER_MMIO_OFFSET | 0) ++ ++/** ++ * DOC: IOCTL IDs ++ * ++ * enum drm_panthor_ioctl_id - IOCTL IDs ++ * ++ * Place new ioctls at the end, don't re-order, don't replace or remove entries. ++ * ++ * These IDs are not meant to be used directly. Use the DRM_IOCTL_PANTHOR_xxx ++ * definitions instead. ++ */ ++enum drm_panthor_ioctl_id { ++ /** @DRM_PANTHOR_DEV_QUERY: Query device information. */ ++ DRM_PANTHOR_DEV_QUERY = 0, ++ ++ /** @DRM_PANTHOR_VM_CREATE: Create a VM. */ ++ DRM_PANTHOR_VM_CREATE, ++ ++ /** @DRM_PANTHOR_VM_DESTROY: Destroy a VM. */ ++ DRM_PANTHOR_VM_DESTROY, ++ ++ /** @DRM_PANTHOR_VM_BIND: Bind/unbind memory to a VM. */ ++ DRM_PANTHOR_VM_BIND, ++ ++ /** @DRM_PANTHOR_VM_GET_STATE: Get VM state. */ ++ DRM_PANTHOR_VM_GET_STATE, ++ ++ /** @DRM_PANTHOR_BO_CREATE: Create a buffer object. */ ++ DRM_PANTHOR_BO_CREATE, ++ ++ /** ++ * @DRM_PANTHOR_BO_MMAP_OFFSET: Get the file offset to pass to ++ * mmap to map a GEM object. ++ */ ++ DRM_PANTHOR_BO_MMAP_OFFSET, ++ ++ /** @DRM_PANTHOR_GROUP_CREATE: Create a scheduling group. */ ++ DRM_PANTHOR_GROUP_CREATE, ++ ++ /** @DRM_PANTHOR_GROUP_DESTROY: Destroy a scheduling group. */ ++ DRM_PANTHOR_GROUP_DESTROY, ++ ++ /** ++ * @DRM_PANTHOR_GROUP_SUBMIT: Submit jobs to queues belonging ++ * to a specific scheduling group. ++ */ ++ DRM_PANTHOR_GROUP_SUBMIT, ++ ++ /** @DRM_PANTHOR_GROUP_GET_STATE: Get the state of a scheduling group. */ ++ DRM_PANTHOR_GROUP_GET_STATE, ++ ++ /** @DRM_PANTHOR_TILER_HEAP_CREATE: Create a tiler heap. */ ++ DRM_PANTHOR_TILER_HEAP_CREATE, ++ ++ /** @DRM_PANTHOR_TILER_HEAP_DESTROY: Destroy a tiler heap. */ ++ DRM_PANTHOR_TILER_HEAP_DESTROY, ++}; ++ ++/** ++ * DRM_IOCTL_PANTHOR() - Build a Panthor IOCTL number ++ * @__access: Access type. Must be R, W or RW. ++ * @__id: One of the DRM_PANTHOR_xxx id. ++ * @__type: Suffix of the type being passed to the IOCTL. ++ * ++ * Don't use this macro directly, use the DRM_IOCTL_PANTHOR_xxx ++ * values instead. ++ * ++ * Return: An IOCTL number to be passed to ioctl() from userspace. ++ */ ++#define DRM_IOCTL_PANTHOR(__access, __id, __type) \ ++ DRM_IO ## __access(DRM_COMMAND_BASE + DRM_PANTHOR_ ## __id, \ ++ struct drm_panthor_ ## __type) ++ ++#define DRM_IOCTL_PANTHOR_DEV_QUERY \ ++ DRM_IOCTL_PANTHOR(WR, DEV_QUERY, dev_query) ++#define DRM_IOCTL_PANTHOR_VM_CREATE \ ++ DRM_IOCTL_PANTHOR(WR, VM_CREATE, vm_create) ++#define DRM_IOCTL_PANTHOR_VM_DESTROY \ ++ DRM_IOCTL_PANTHOR(WR, VM_DESTROY, vm_destroy) ++#define DRM_IOCTL_PANTHOR_VM_BIND \ ++ DRM_IOCTL_PANTHOR(WR, VM_BIND, vm_bind) ++#define DRM_IOCTL_PANTHOR_VM_GET_STATE \ ++ DRM_IOCTL_PANTHOR(WR, VM_GET_STATE, vm_get_state) ++#define DRM_IOCTL_PANTHOR_BO_CREATE \ ++ DRM_IOCTL_PANTHOR(WR, BO_CREATE, bo_create) ++#define DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET \ ++ DRM_IOCTL_PANTHOR(WR, BO_MMAP_OFFSET, bo_mmap_offset) ++#define DRM_IOCTL_PANTHOR_GROUP_CREATE \ ++ DRM_IOCTL_PANTHOR(WR, GROUP_CREATE, group_create) ++#define DRM_IOCTL_PANTHOR_GROUP_DESTROY \ ++ DRM_IOCTL_PANTHOR(WR, GROUP_DESTROY, group_destroy) ++#define DRM_IOCTL_PANTHOR_GROUP_SUBMIT \ ++ DRM_IOCTL_PANTHOR(WR, GROUP_SUBMIT, group_submit) ++#define DRM_IOCTL_PANTHOR_GROUP_GET_STATE \ ++ DRM_IOCTL_PANTHOR(WR, GROUP_GET_STATE, group_get_state) ++#define DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE \ ++ DRM_IOCTL_PANTHOR(WR, TILER_HEAP_CREATE, tiler_heap_create) ++#define DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY \ ++ DRM_IOCTL_PANTHOR(WR, TILER_HEAP_DESTROY, tiler_heap_destroy) ++ ++/** ++ * DOC: IOCTL arguments ++ */ ++ ++/** ++ * struct drm_panthor_obj_array - Object array. ++ * ++ * This object is used to pass an array of objects whose size is subject to changes in ++ * future versions of the driver. In order to support this mutability, we pass a stride ++ * describing the size of the object as known by userspace. ++ * ++ * You shouldn't fill drm_panthor_obj_array fields directly. You should instead use ++ * the DRM_PANTHOR_OBJ_ARRAY() macro that takes care of initializing the stride to ++ * the object size. ++ */ ++struct drm_panthor_obj_array { ++ /** @stride: Stride of object struct. Used for versioning. */ ++ __u32 stride; ++ ++ /** @count: Number of objects in the array. */ ++ __u32 count; ++ ++ /** @array: User pointer to an array of objects. */ ++ __u64 array; ++}; ++ ++/** ++ * DRM_PANTHOR_OBJ_ARRAY() - Initialize a drm_panthor_obj_array field. ++ * @cnt: Number of elements in the array. ++ * @ptr: Pointer to the array to pass to the kernel. ++ * ++ * Macro initializing a drm_panthor_obj_array based on the object size as known ++ * by userspace. ++ */ ++#define DRM_PANTHOR_OBJ_ARRAY(cnt, ptr) \ ++ { .stride = sizeof((ptr)[0]), .count = (cnt), .array = (__u64)(uintptr_t)(ptr) } ++ ++/** ++ * enum drm_panthor_sync_op_flags - Synchronization operation flags. ++ */ ++enum drm_panthor_sync_op_flags { ++ /** @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK: Synchronization handle type mask. */ ++ DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK = 0xff, ++ ++ /** @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ: Synchronization object type. */ ++ DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ = 0, ++ ++ /** ++ * @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ: Timeline synchronization ++ * object type. ++ */ ++ DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ = 1, ++ ++ /** @DRM_PANTHOR_SYNC_OP_WAIT: Wait operation. */ ++ DRM_PANTHOR_SYNC_OP_WAIT = 0 << 31, ++ ++ /** @DRM_PANTHOR_SYNC_OP_SIGNAL: Signal operation. */ ++ DRM_PANTHOR_SYNC_OP_SIGNAL = (int)(1u << 31), ++}; ++ ++/** ++ * struct drm_panthor_sync_op - Synchronization operation. ++ */ ++struct drm_panthor_sync_op { ++ /** @flags: Synchronization operation flags. Combination of DRM_PANTHOR_SYNC_OP values. */ ++ __u32 flags; ++ ++ /** @handle: Sync handle. */ ++ __u32 handle; ++ ++ /** ++ * @timeline_value: MBZ if ++ * (flags & DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK) != ++ * DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ. ++ */ ++ __u64 timeline_value; ++}; ++ ++/** ++ * enum drm_panthor_dev_query_type - Query type ++ * ++ * Place new types at the end, don't re-order, don't remove or replace. ++ */ ++enum drm_panthor_dev_query_type { ++ /** @DRM_PANTHOR_DEV_QUERY_GPU_INFO: Query GPU information. */ ++ DRM_PANTHOR_DEV_QUERY_GPU_INFO = 0, ++ ++ /** @DRM_PANTHOR_DEV_QUERY_CSIF_INFO: Query command-stream interface information. */ ++ DRM_PANTHOR_DEV_QUERY_CSIF_INFO, ++}; ++ ++/** ++ * struct drm_panthor_gpu_info - GPU information ++ * ++ * Structure grouping all queryable information relating to the GPU. ++ */ ++struct drm_panthor_gpu_info { ++ /** @gpu_id : GPU ID. */ ++ __u32 gpu_id; ++#define DRM_PANTHOR_ARCH_MAJOR(x) ((x) >> 28) ++#define DRM_PANTHOR_ARCH_MINOR(x) (((x) >> 24) & 0xf) ++#define DRM_PANTHOR_ARCH_REV(x) (((x) >> 20) & 0xf) ++#define DRM_PANTHOR_PRODUCT_MAJOR(x) (((x) >> 16) & 0xf) ++#define DRM_PANTHOR_VERSION_MAJOR(x) (((x) >> 12) & 0xf) ++#define DRM_PANTHOR_VERSION_MINOR(x) (((x) >> 4) & 0xff) ++#define DRM_PANTHOR_VERSION_STATUS(x) ((x) & 0xf) ++ ++ /** @gpu_rev: GPU revision. */ ++ __u32 gpu_rev; ++ ++ /** @csf_id: Command stream frontend ID. */ ++ __u32 csf_id; ++#define DRM_PANTHOR_CSHW_MAJOR(x) (((x) >> 26) & 0x3f) ++#define DRM_PANTHOR_CSHW_MINOR(x) (((x) >> 20) & 0x3f) ++#define DRM_PANTHOR_CSHW_REV(x) (((x) >> 16) & 0xf) ++#define DRM_PANTHOR_MCU_MAJOR(x) (((x) >> 10) & 0x3f) ++#define DRM_PANTHOR_MCU_MINOR(x) (((x) >> 4) & 0x3f) ++#define DRM_PANTHOR_MCU_REV(x) ((x) & 0xf) ++ ++ /** @l2_features: L2-cache features. */ ++ __u32 l2_features; ++ ++ /** @tiler_features: Tiler features. */ ++ __u32 tiler_features; ++ ++ /** @mem_features: Memory features. */ ++ __u32 mem_features; ++ ++ /** @mmu_features: MMU features. */ ++ __u32 mmu_features; ++#define DRM_PANTHOR_MMU_VA_BITS(x) ((x) & 0xff) ++ ++ /** @thread_features: Thread features. */ ++ __u32 thread_features; ++ ++ /** @max_threads: Maximum number of threads. */ ++ __u32 max_threads; ++ ++ /** @thread_max_workgroup_size: Maximum workgroup size. */ ++ __u32 thread_max_workgroup_size; ++ ++ /** ++ * @thread_max_barrier_size: Maximum number of threads that can wait ++ * simultaneously on a barrier. ++ */ ++ __u32 thread_max_barrier_size; ++ ++ /** @coherency_features: Coherency features. */ ++ __u32 coherency_features; ++ ++ /** @texture_features: Texture features. */ ++ __u32 texture_features[4]; ++ ++ /** @as_present: Bitmask encoding the number of address-space exposed by the MMU. */ ++ __u32 as_present; ++ ++ /** @shader_present: Bitmask encoding the shader cores exposed by the GPU. */ ++ __u64 shader_present; ++ ++ /** @l2_present: Bitmask encoding the L2 caches exposed by the GPU. */ ++ __u64 l2_present; ++ ++ /** @tiler_present: Bitmask encoding the tiler units exposed by the GPU. */ ++ __u64 tiler_present; ++ ++ /* @core_features: Used to discriminate core variants when they exist. */ ++ __u32 core_features; ++ ++ /* @pad: MBZ. */ ++ __u32 pad; ++}; ++ ++/** ++ * struct drm_panthor_csif_info - Command stream interface information ++ * ++ * Structure grouping all queryable information relating to the command stream interface. ++ */ ++struct drm_panthor_csif_info { ++ /** @csg_slot_count: Number of command stream group slots exposed by the firmware. */ ++ __u32 csg_slot_count; ++ ++ /** @cs_slot_count: Number of command stream slots per group. */ ++ __u32 cs_slot_count; ++ ++ /** @cs_reg_count: Number of command stream registers. */ ++ __u32 cs_reg_count; ++ ++ /** @scoreboard_slot_count: Number of scoreboard slots. */ ++ __u32 scoreboard_slot_count; ++ ++ /** ++ * @unpreserved_cs_reg_count: Number of command stream registers reserved by ++ * the kernel driver to call a userspace command stream. ++ * ++ * All registers can be used by a userspace command stream, but the ++ * [cs_slot_count - unpreserved_cs_reg_count .. cs_slot_count] registers are ++ * used by the kernel when DRM_PANTHOR_IOCTL_GROUP_SUBMIT is called. ++ */ ++ __u32 unpreserved_cs_reg_count; ++ ++ /** ++ * @pad: Padding field, set to zero. ++ */ ++ __u32 pad; ++}; ++ ++/** ++ * struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY ++ */ ++struct drm_panthor_dev_query { ++ /** @type: the query type (see drm_panthor_dev_query_type). */ ++ __u32 type; ++ ++ /** ++ * @size: size of the type being queried. ++ * ++ * If pointer is NULL, size is updated by the driver to provide the ++ * output structure size. If pointer is not NULL, the driver will ++ * only copy min(size, actual_structure_size) bytes to the pointer, ++ * and update the size accordingly. This allows us to extend query ++ * types without breaking userspace. ++ */ ++ __u32 size; ++ ++ /** ++ * @pointer: user pointer to a query type struct. ++ * ++ * Pointer can be NULL, in which case, nothing is copied, but the ++ * actual structure size is returned. If not NULL, it must point to ++ * a location that's large enough to hold size bytes. ++ */ ++ __u64 pointer; ++}; ++ ++/** ++ * struct drm_panthor_vm_create - Arguments passed to DRM_PANTHOR_IOCTL_VM_CREATE ++ */ ++struct drm_panthor_vm_create { ++ /** @flags: VM flags, MBZ. */ ++ __u32 flags; ++ ++ /** @id: Returned VM ID. */ ++ __u32 id; ++ ++ /** ++ * @user_va_range: Size of the VA space reserved for user objects. ++ * ++ * The kernel will pick the remaining space to map kernel-only objects to the ++ * VM (heap chunks, heap context, ring buffers, kernel synchronization objects, ++ * ...). If the space left for kernel objects is too small, kernel object ++ * allocation will fail further down the road. One can use ++ * drm_panthor_gpu_info::mmu_features to extract the total virtual address ++ * range, and chose a user_va_range that leaves some space to the kernel. ++ * ++ * If user_va_range is zero, the kernel will pick a sensible value based on ++ * TASK_SIZE and the virtual range supported by the GPU MMU (the kernel/user ++ * split should leave enough VA space for userspace processes to support SVM, ++ * while still allowing the kernel to map some amount of kernel objects in ++ * the kernel VA range). The value chosen by the driver will be returned in ++ * @user_va_range. ++ * ++ * User VA space always starts at 0x0, kernel VA space is always placed after ++ * the user VA range. ++ */ ++ __u64 user_va_range; ++}; ++ ++/** ++ * struct drm_panthor_vm_destroy - Arguments passed to DRM_PANTHOR_IOCTL_VM_DESTROY ++ */ ++struct drm_panthor_vm_destroy { ++ /** @id: ID of the VM to destroy. */ ++ __u32 id; ++ ++ /** @pad: MBZ. */ ++ __u32 pad; ++}; ++ ++/** ++ * enum drm_panthor_vm_bind_op_flags - VM bind operation flags ++ */ ++enum drm_panthor_vm_bind_op_flags { ++ /** ++ * @DRM_PANTHOR_VM_BIND_OP_MAP_READONLY: Map the memory read-only. ++ * ++ * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP. ++ */ ++ DRM_PANTHOR_VM_BIND_OP_MAP_READONLY = 1 << 0, ++ ++ /** ++ * @DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC: Map the memory not-executable. ++ * ++ * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP. ++ */ ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC = 1 << 1, ++ ++ /** ++ * @DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED: Map the memory uncached. ++ * ++ * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP. ++ */ ++ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED = 1 << 2, ++ ++ /** ++ * @DRM_PANTHOR_VM_BIND_OP_TYPE_MASK: Mask used to determine the type of operation. ++ */ ++ DRM_PANTHOR_VM_BIND_OP_TYPE_MASK = (int)(0xfu << 28), ++ ++ /** @DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: Map operation. */ ++ DRM_PANTHOR_VM_BIND_OP_TYPE_MAP = 0 << 28, ++ ++ /** @DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP: Unmap operation. */ ++ DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP = 1 << 28, ++ ++ /** ++ * @DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY: No VM operation. ++ * ++ * Just serves as a synchronization point on a VM queue. ++ * ++ * Only valid if %DRM_PANTHOR_VM_BIND_ASYNC is set in drm_panthor_vm_bind::flags, ++ * and drm_panthor_vm_bind_op::syncs contains at least one element. ++ */ ++ DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY = 2 << 28, ++}; ++ ++/** ++ * struct drm_panthor_vm_bind_op - VM bind operation ++ */ ++struct drm_panthor_vm_bind_op { ++ /** @flags: Combination of drm_panthor_vm_bind_op_flags flags. */ ++ __u32 flags; ++ ++ /** ++ * @bo_handle: Handle of the buffer object to map. ++ * MBZ for unmap or sync-only operations. ++ */ ++ __u32 bo_handle; ++ ++ /** ++ * @bo_offset: Buffer object offset. ++ * MBZ for unmap or sync-only operations. ++ */ ++ __u64 bo_offset; ++ ++ /** ++ * @va: Virtual address to map/unmap. ++ * MBZ for sync-only operations. ++ */ ++ __u64 va; ++ ++ /** ++ * @size: Size to map/unmap. ++ * MBZ for sync-only operations. ++ */ ++ __u64 size; ++ ++ /** ++ * @syncs: Array of struct drm_panthor_sync_op synchronization ++ * operations. ++ * ++ * This array must be empty if %DRM_PANTHOR_VM_BIND_ASYNC is not set on ++ * the drm_panthor_vm_bind object containing this VM bind operation. ++ * ++ * This array shall not be empty for sync-only operations. ++ */ ++ struct drm_panthor_obj_array syncs; ++ ++}; ++ ++/** ++ * enum drm_panthor_vm_bind_flags - VM bind flags ++ */ ++enum drm_panthor_vm_bind_flags { ++ /** ++ * @DRM_PANTHOR_VM_BIND_ASYNC: VM bind operations are queued to the VM ++ * queue instead of being executed synchronously. ++ */ ++ DRM_PANTHOR_VM_BIND_ASYNC = 1 << 0, ++}; ++ ++/** ++ * struct drm_panthor_vm_bind - Arguments passed to DRM_IOCTL_PANTHOR_VM_BIND ++ */ ++struct drm_panthor_vm_bind { ++ /** @vm_id: VM targeted by the bind request. */ ++ __u32 vm_id; ++ ++ /** @flags: Combination of drm_panthor_vm_bind_flags flags. */ ++ __u32 flags; ++ ++ /** @ops: Array of struct drm_panthor_vm_bind_op bind operations. */ ++ struct drm_panthor_obj_array ops; ++}; ++ ++/** ++ * enum drm_panthor_vm_state - VM states. ++ */ ++enum drm_panthor_vm_state { ++ /** ++ * @DRM_PANTHOR_VM_STATE_USABLE: VM is usable. ++ * ++ * New VM operations will be accepted on this VM. ++ */ ++ DRM_PANTHOR_VM_STATE_USABLE, ++ ++ /** ++ * @DRM_PANTHOR_VM_STATE_UNUSABLE: VM is unusable. ++ * ++ * Something put the VM in an unusable state (like an asynchronous ++ * VM_BIND request failing for any reason). ++ * ++ * Once the VM is in this state, all new MAP operations will be ++ * rejected, and any GPU job targeting this VM will fail. ++ * UNMAP operations are still accepted. ++ * ++ * The only way to recover from an unusable VM is to create a new ++ * VM, and destroy the old one. ++ */ ++ DRM_PANTHOR_VM_STATE_UNUSABLE, ++}; ++ ++/** ++ * struct drm_panthor_vm_get_state - Get VM state. ++ */ ++struct drm_panthor_vm_get_state { ++ /** @vm_id: VM targeted by the get_state request. */ ++ __u32 vm_id; ++ ++ /** ++ * @state: state returned by the driver. ++ * ++ * Must be one of the enum drm_panthor_vm_state values. ++ */ ++ __u32 state; ++}; ++ ++/** ++ * enum drm_panthor_bo_flags - Buffer object flags, passed at creation time. ++ */ ++enum drm_panthor_bo_flags { ++ /** @DRM_PANTHOR_BO_NO_MMAP: The buffer object will never be CPU-mapped in userspace. */ ++ DRM_PANTHOR_BO_NO_MMAP = (1 << 0), ++}; ++ ++/** ++ * struct drm_panthor_bo_create - Arguments passed to DRM_IOCTL_PANTHOR_BO_CREATE. ++ */ ++struct drm_panthor_bo_create { ++ /** ++ * @size: Requested size for the object ++ * ++ * The (page-aligned) allocated size for the object will be returned. ++ */ ++ __u64 size; ++ ++ /** ++ * @flags: Flags. Must be a combination of drm_panthor_bo_flags flags. ++ */ ++ __u32 flags; ++ ++ /** ++ * @exclusive_vm_id: Exclusive VM this buffer object will be mapped to. ++ * ++ * If not zero, the field must refer to a valid VM ID, and implies that: ++ * - the buffer object will only ever be bound to that VM ++ * - cannot be exported as a PRIME fd ++ */ ++ __u32 exclusive_vm_id; ++ ++ /** ++ * @handle: Returned handle for the object. ++ * ++ * Object handles are nonzero. ++ */ ++ __u32 handle; ++ ++ /** @pad: MBZ. */ ++ __u32 pad; ++}; ++ ++/** ++ * struct drm_panthor_bo_mmap_offset - Arguments passed to DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET. ++ */ ++struct drm_panthor_bo_mmap_offset { ++ /** @handle: Handle of the object we want an mmap offset for. */ ++ __u32 handle; ++ ++ /** @pad: MBZ. */ ++ __u32 pad; ++ ++ /** @offset: The fake offset to use for subsequent mmap calls. */ ++ __u64 offset; ++}; ++ ++/** ++ * struct drm_panthor_queue_create - Queue creation arguments. ++ */ ++struct drm_panthor_queue_create { ++ /** ++ * @priority: Defines the priority of queues inside a group. Goes from 0 to 15, ++ * 15 being the highest priority. ++ */ ++ __u8 priority; ++ ++ /** @pad: Padding fields, MBZ. */ ++ __u8 pad[3]; ++ ++ /** @ringbuf_size: Size of the ring buffer to allocate to this queue. */ ++ __u32 ringbuf_size; ++}; ++ ++/** ++ * enum drm_panthor_group_priority - Scheduling group priority ++ */ ++enum drm_panthor_group_priority { ++ /** @PANTHOR_GROUP_PRIORITY_LOW: Low priority group. */ ++ PANTHOR_GROUP_PRIORITY_LOW = 0, ++ ++ /** @PANTHOR_GROUP_PRIORITY_MEDIUM: Medium priority group. */ ++ PANTHOR_GROUP_PRIORITY_MEDIUM, ++ ++ /** @PANTHOR_GROUP_PRIORITY_HIGH: High priority group. */ ++ PANTHOR_GROUP_PRIORITY_HIGH, ++}; ++ ++/** ++ * struct drm_panthor_group_create - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_CREATE ++ */ ++struct drm_panthor_group_create { ++ /** @queues: Array of drm_panthor_queue_create elements. */ ++ struct drm_panthor_obj_array queues; ++ ++ /** ++ * @max_compute_cores: Maximum number of cores that can be used by compute ++ * jobs across CS queues bound to this group. ++ * ++ * Must be less or equal to the number of bits set in @compute_core_mask. ++ */ ++ __u8 max_compute_cores; ++ ++ /** ++ * @max_fragment_cores: Maximum number of cores that can be used by fragment ++ * jobs across CS queues bound to this group. ++ * ++ * Must be less or equal to the number of bits set in @fragment_core_mask. ++ */ ++ __u8 max_fragment_cores; ++ ++ /** ++ * @max_tiler_cores: Maximum number of tilers that can be used by tiler jobs ++ * across CS queues bound to this group. ++ * ++ * Must be less or equal to the number of bits set in @tiler_core_mask. ++ */ ++ __u8 max_tiler_cores; ++ ++ /** @priority: Group priority (see enum drm_panthor_group_priority). */ ++ __u8 priority; ++ ++ /** @pad: Padding field, MBZ. */ ++ __u32 pad; ++ ++ /** ++ * @compute_core_mask: Mask encoding cores that can be used for compute jobs. ++ * ++ * This field must have at least @max_compute_cores bits set. ++ * ++ * The bits set here should also be set in drm_panthor_gpu_info::shader_present. ++ */ ++ __u64 compute_core_mask; ++ ++ /** ++ * @fragment_core_mask: Mask encoding cores that can be used for fragment jobs. ++ * ++ * This field must have at least @max_fragment_cores bits set. ++ * ++ * The bits set here should also be set in drm_panthor_gpu_info::shader_present. ++ */ ++ __u64 fragment_core_mask; ++ ++ /** ++ * @tiler_core_mask: Mask encoding cores that can be used for tiler jobs. ++ * ++ * This field must have at least @max_tiler_cores bits set. ++ * ++ * The bits set here should also be set in drm_panthor_gpu_info::tiler_present. ++ */ ++ __u64 tiler_core_mask; ++ ++ /** ++ * @vm_id: VM ID to bind this group to. ++ * ++ * All submission to queues bound to this group will use this VM. ++ */ ++ __u32 vm_id; ++ ++ /** ++ * @group_handle: Returned group handle. Passed back when submitting jobs or ++ * destroying a group. ++ */ ++ __u32 group_handle; ++}; ++ ++/** ++ * struct drm_panthor_group_destroy - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_DESTROY ++ */ ++struct drm_panthor_group_destroy { ++ /** @group_handle: Group to destroy */ ++ __u32 group_handle; ++ ++ /** @pad: Padding field, MBZ. */ ++ __u32 pad; ++}; ++ ++/** ++ * struct drm_panthor_queue_submit - Job submission arguments. ++ * ++ * This is describing the userspace command stream to call from the kernel ++ * command stream ring-buffer. Queue submission is always part of a group ++ * submission, taking one or more jobs to submit to the underlying queues. ++ */ ++struct drm_panthor_queue_submit { ++ /** @queue_index: Index of the queue inside a group. */ ++ __u32 queue_index; ++ ++ /** ++ * @stream_size: Size of the command stream to execute. ++ * ++ * Must be 64-bit/8-byte aligned (the size of a CS instruction) ++ * ++ * Can be zero if stream_addr is zero too. ++ */ ++ __u32 stream_size; ++ ++ /** ++ * @stream_addr: GPU address of the command stream to execute. ++ * ++ * Must be aligned on 64-byte. ++ * ++ * Can be zero is stream_size is zero too. ++ */ ++ __u64 stream_addr; ++ ++ /** ++ * @latest_flush: FLUSH_ID read at the time the stream was built. ++ * ++ * This allows cache flush elimination for the automatic ++ * flush+invalidate(all) done at submission time, which is needed to ++ * ensure the GPU doesn't get garbage when reading the indirect command ++ * stream buffers. If you want the cache flush to happen ++ * unconditionally, pass a zero here. ++ */ ++ __u32 latest_flush; ++ ++ /** @pad: MBZ. */ ++ __u32 pad; ++ ++ /** @syncs: Array of struct drm_panthor_sync_op sync operations. */ ++ struct drm_panthor_obj_array syncs; ++}; ++ ++/** ++ * struct drm_panthor_group_submit - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_SUBMIT ++ */ ++struct drm_panthor_group_submit { ++ /** @group_handle: Handle of the group to queue jobs to. */ ++ __u32 group_handle; ++ ++ /** @pad: MBZ. */ ++ __u32 pad; ++ ++ /** @queue_submits: Array of drm_panthor_queue_submit objects. */ ++ struct drm_panthor_obj_array queue_submits; ++}; ++ ++/** ++ * enum drm_panthor_group_state_flags - Group state flags ++ */ ++enum drm_panthor_group_state_flags { ++ /** ++ * @DRM_PANTHOR_GROUP_STATE_TIMEDOUT: Group had unfinished jobs. ++ * ++ * When a group ends up with this flag set, no jobs can be submitted to its queues. ++ */ ++ DRM_PANTHOR_GROUP_STATE_TIMEDOUT = 1 << 0, ++ ++ /** ++ * @DRM_PANTHOR_GROUP_STATE_FATAL_FAULT: Group had fatal faults. ++ * ++ * When a group ends up with this flag set, no jobs can be submitted to its queues. ++ */ ++ DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1, ++}; ++ ++/** ++ * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE ++ * ++ * Used to query the state of a group and decide whether a new group should be created to ++ * replace it. ++ */ ++struct drm_panthor_group_get_state { ++ /** @group_handle: Handle of the group to query state on */ ++ __u32 group_handle; ++ ++ /** ++ * @state: Combination of DRM_PANTHOR_GROUP_STATE_* flags encoding the ++ * group state. ++ */ ++ __u32 state; ++ ++ /** @fatal_queues: Bitmask of queues that faced fatal faults. */ ++ __u32 fatal_queues; ++ ++ /** @pad: MBZ */ ++ __u32 pad; ++}; ++ ++/** ++ * struct drm_panthor_tiler_heap_create - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE ++ */ ++struct drm_panthor_tiler_heap_create { ++ /** @vm_id: VM ID the tiler heap should be mapped to */ ++ __u32 vm_id; ++ ++ /** @initial_chunk_count: Initial number of chunks to allocate. */ ++ __u32 initial_chunk_count; ++ ++ /** @chunk_size: Chunk size. Must be a power of two at least 256KB large. */ ++ __u32 chunk_size; ++ ++ /** @max_chunks: Maximum number of chunks that can be allocated. */ ++ __u32 max_chunks; ++ ++ /** ++ * @target_in_flight: Maximum number of in-flight render passes. ++ * ++ * If the heap has more than tiler jobs in-flight, the FW will wait for render ++ * passes to finish before queuing new tiler jobs. ++ */ ++ __u32 target_in_flight; ++ ++ /** @handle: Returned heap handle. Passed back to DESTROY_TILER_HEAP. */ ++ __u32 handle; ++ ++ /** @tiler_heap_ctx_gpu_va: Returned heap GPU virtual address returned */ ++ __u64 tiler_heap_ctx_gpu_va; ++ ++ /** ++ * @first_heap_chunk_gpu_va: First heap chunk. ++ * ++ * The tiler heap is formed of heap chunks forming a single-link list. This ++ * is the first element in the list. ++ */ ++ __u64 first_heap_chunk_gpu_va; ++}; ++ ++/** ++ * struct drm_panthor_tiler_heap_destroy - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY ++ */ ++struct drm_panthor_tiler_heap_destroy { ++ /** @handle: Handle of the tiler heap to destroy */ ++ __u32 handle; ++ ++ /** @pad: Padding field, MBZ. */ ++ __u32 pad; ++}; ++ ++#if defined(__cplusplus) ++} ++#endif ++ ++#endif /* _PANTHOR_DRM_H_ */ +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:16 +0100 +Subject: drm/panthor: Add GPU register definitions + +Those are the registers directly accessible through the MMIO range. + +FW registers are exposed in panthor_fw.h. + +v6: +- Add Maxime's and Heiko's acks + +v4: +- Add the CORE_FEATURES register (needed for GPU variants) +- Add Steve's R-b + +v3: +- Add macros to extract GPU ID info +- Formatting changes +- Remove AS_TRANSCFG_ADRMODE_LEGACY - it doesn't exist post-CSF +- Remove CSF_GPU_LATEST_FLUSH_ID_DEFAULT +- Add GPU_L2_FEATURES_LINE_SIZE for extracting the GPU cache line size + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_regs.h | 239 ++++++++++ + 1 file changed, 239 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_regs.h b/drivers/gpu/drm/panthor/panthor_regs.h +new file mode 100644 +index 000000000000..b7b3b3add166 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_regs.h +@@ -0,0 +1,239 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2018 Marty E. Plummer */ ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++/* ++ * Register definitions based on mali_kbase_gpu_regmap.h and ++ * mali_kbase_gpu_regmap_csf.h ++ * (C) COPYRIGHT 2010-2022 ARM Limited. All rights reserved. ++ */ ++#ifndef __PANTHOR_REGS_H__ ++#define __PANTHOR_REGS_H__ ++ ++#define GPU_ID 0x0 ++#define GPU_ARCH_MAJOR(x) ((x) >> 28) ++#define GPU_ARCH_MINOR(x) (((x) & GENMASK(27, 24)) >> 24) ++#define GPU_ARCH_REV(x) (((x) & GENMASK(23, 20)) >> 20) ++#define GPU_PROD_MAJOR(x) (((x) & GENMASK(19, 16)) >> 16) ++#define GPU_VER_MAJOR(x) (((x) & GENMASK(15, 12)) >> 12) ++#define GPU_VER_MINOR(x) (((x) & GENMASK(11, 4)) >> 4) ++#define GPU_VER_STATUS(x) ((x) & GENMASK(3, 0)) ++ ++#define GPU_L2_FEATURES 0x4 ++#define GPU_L2_FEATURES_LINE_SIZE(x) (1 << ((x) & GENMASK(7, 0))) ++ ++#define GPU_CORE_FEATURES 0x8 ++ ++#define GPU_TILER_FEATURES 0xC ++#define GPU_MEM_FEATURES 0x10 ++#define GROUPS_L2_COHERENT BIT(0) ++ ++#define GPU_MMU_FEATURES 0x14 ++#define GPU_MMU_FEATURES_VA_BITS(x) ((x) & GENMASK(7, 0)) ++#define GPU_MMU_FEATURES_PA_BITS(x) (((x) >> 8) & GENMASK(7, 0)) ++#define GPU_AS_PRESENT 0x18 ++#define GPU_CSF_ID 0x1C ++ ++#define GPU_INT_RAWSTAT 0x20 ++#define GPU_INT_CLEAR 0x24 ++#define GPU_INT_MASK 0x28 ++#define GPU_INT_STAT 0x2c ++#define GPU_IRQ_FAULT BIT(0) ++#define GPU_IRQ_PROTM_FAULT BIT(1) ++#define GPU_IRQ_RESET_COMPLETED BIT(8) ++#define GPU_IRQ_POWER_CHANGED BIT(9) ++#define GPU_IRQ_POWER_CHANGED_ALL BIT(10) ++#define GPU_IRQ_CLEAN_CACHES_COMPLETED BIT(17) ++#define GPU_IRQ_DOORBELL_MIRROR BIT(18) ++#define GPU_IRQ_MCU_STATUS_CHANGED BIT(19) ++#define GPU_CMD 0x30 ++#define GPU_CMD_DEF(type, payload) ((type) | ((payload) << 8)) ++#define GPU_SOFT_RESET GPU_CMD_DEF(1, 1) ++#define GPU_HARD_RESET GPU_CMD_DEF(1, 2) ++#define CACHE_CLEAN BIT(0) ++#define CACHE_INV BIT(1) ++#define GPU_FLUSH_CACHES(l2, lsc, oth) \ ++ GPU_CMD_DEF(4, ((l2) << 0) | ((lsc) << 4) | ((oth) << 8)) ++ ++#define GPU_STATUS 0x34 ++#define GPU_STATUS_ACTIVE BIT(0) ++#define GPU_STATUS_PWR_ACTIVE BIT(1) ++#define GPU_STATUS_PAGE_FAULT BIT(4) ++#define GPU_STATUS_PROTM_ACTIVE BIT(7) ++#define GPU_STATUS_DBG_ENABLED BIT(8) ++ ++#define GPU_FAULT_STATUS 0x3C ++#define GPU_FAULT_ADDR_LO 0x40 ++#define GPU_FAULT_ADDR_HI 0x44 ++ ++#define GPU_PWR_KEY 0x50 ++#define GPU_PWR_KEY_UNLOCK 0x2968A819 ++#define GPU_PWR_OVERRIDE0 0x54 ++#define GPU_PWR_OVERRIDE1 0x58 ++ ++#define GPU_TIMESTAMP_OFFSET_LO 0x88 ++#define GPU_TIMESTAMP_OFFSET_HI 0x8C ++#define GPU_CYCLE_COUNT_LO 0x90 ++#define GPU_CYCLE_COUNT_HI 0x94 ++#define GPU_TIMESTAMP_LO 0x98 ++#define GPU_TIMESTAMP_HI 0x9C ++ ++#define GPU_THREAD_MAX_THREADS 0xA0 ++#define GPU_THREAD_MAX_WORKGROUP_SIZE 0xA4 ++#define GPU_THREAD_MAX_BARRIER_SIZE 0xA8 ++#define GPU_THREAD_FEATURES 0xAC ++ ++#define GPU_TEXTURE_FEATURES(n) (0xB0 + ((n) * 4)) ++ ++#define GPU_SHADER_PRESENT_LO 0x100 ++#define GPU_SHADER_PRESENT_HI 0x104 ++#define GPU_TILER_PRESENT_LO 0x110 ++#define GPU_TILER_PRESENT_HI 0x114 ++#define GPU_L2_PRESENT_LO 0x120 ++#define GPU_L2_PRESENT_HI 0x124 ++ ++#define SHADER_READY_LO 0x140 ++#define SHADER_READY_HI 0x144 ++#define TILER_READY_LO 0x150 ++#define TILER_READY_HI 0x154 ++#define L2_READY_LO 0x160 ++#define L2_READY_HI 0x164 ++ ++#define SHADER_PWRON_LO 0x180 ++#define SHADER_PWRON_HI 0x184 ++#define TILER_PWRON_LO 0x190 ++#define TILER_PWRON_HI 0x194 ++#define L2_PWRON_LO 0x1A0 ++#define L2_PWRON_HI 0x1A4 ++ ++#define SHADER_PWROFF_LO 0x1C0 ++#define SHADER_PWROFF_HI 0x1C4 ++#define TILER_PWROFF_LO 0x1D0 ++#define TILER_PWROFF_HI 0x1D4 ++#define L2_PWROFF_LO 0x1E0 ++#define L2_PWROFF_HI 0x1E4 ++ ++#define SHADER_PWRTRANS_LO 0x200 ++#define SHADER_PWRTRANS_HI 0x204 ++#define TILER_PWRTRANS_LO 0x210 ++#define TILER_PWRTRANS_HI 0x214 ++#define L2_PWRTRANS_LO 0x220 ++#define L2_PWRTRANS_HI 0x224 ++ ++#define SHADER_PWRACTIVE_LO 0x240 ++#define SHADER_PWRACTIVE_HI 0x244 ++#define TILER_PWRACTIVE_LO 0x250 ++#define TILER_PWRACTIVE_HI 0x254 ++#define L2_PWRACTIVE_LO 0x260 ++#define L2_PWRACTIVE_HI 0x264 ++ ++#define GPU_REVID 0x280 ++ ++#define GPU_COHERENCY_FEATURES 0x300 ++#define GPU_COHERENCY_PROT_BIT(name) BIT(GPU_COHERENCY_ ## name) ++ ++#define GPU_COHERENCY_PROTOCOL 0x304 ++#define GPU_COHERENCY_ACE 0 ++#define GPU_COHERENCY_ACE_LITE 1 ++#define GPU_COHERENCY_NONE 31 ++ ++#define MCU_CONTROL 0x700 ++#define MCU_CONTROL_ENABLE 1 ++#define MCU_CONTROL_AUTO 2 ++#define MCU_CONTROL_DISABLE 0 ++ ++#define MCU_STATUS 0x704 ++#define MCU_STATUS_DISABLED 0 ++#define MCU_STATUS_ENABLED 1 ++#define MCU_STATUS_HALT 2 ++#define MCU_STATUS_FATAL 3 ++ ++/* Job Control regs */ ++#define JOB_INT_RAWSTAT 0x1000 ++#define JOB_INT_CLEAR 0x1004 ++#define JOB_INT_MASK 0x1008 ++#define JOB_INT_STAT 0x100c ++#define JOB_INT_GLOBAL_IF BIT(31) ++#define JOB_INT_CSG_IF(x) BIT(x) ++ ++/* MMU regs */ ++#define MMU_INT_RAWSTAT 0x2000 ++#define MMU_INT_CLEAR 0x2004 ++#define MMU_INT_MASK 0x2008 ++#define MMU_INT_STAT 0x200c ++ ++/* AS_COMMAND register commands */ ++ ++#define MMU_BASE 0x2400 ++#define MMU_AS_SHIFT 6 ++#define MMU_AS(as) (MMU_BASE + ((as) << MMU_AS_SHIFT)) ++ ++#define AS_TRANSTAB_LO(as) (MMU_AS(as) + 0x0) ++#define AS_TRANSTAB_HI(as) (MMU_AS(as) + 0x4) ++#define AS_MEMATTR_LO(as) (MMU_AS(as) + 0x8) ++#define AS_MEMATTR_HI(as) (MMU_AS(as) + 0xC) ++#define AS_MEMATTR_AARCH64_INNER_ALLOC_IMPL (2 << 2) ++#define AS_MEMATTR_AARCH64_INNER_ALLOC_EXPL(w, r) ((3 << 2) | \ ++ ((w) ? BIT(0) : 0) | \ ++ ((r) ? BIT(1) : 0)) ++#define AS_MEMATTR_AARCH64_SH_MIDGARD_INNER (0 << 4) ++#define AS_MEMATTR_AARCH64_SH_CPU_INNER (1 << 4) ++#define AS_MEMATTR_AARCH64_SH_CPU_INNER_SHADER_COH (2 << 4) ++#define AS_MEMATTR_AARCH64_SHARED (0 << 6) ++#define AS_MEMATTR_AARCH64_INNER_OUTER_NC (1 << 6) ++#define AS_MEMATTR_AARCH64_INNER_OUTER_WB (2 << 6) ++#define AS_MEMATTR_AARCH64_FAULT (3 << 6) ++#define AS_LOCKADDR_LO(as) (MMU_AS(as) + 0x10) ++#define AS_LOCKADDR_HI(as) (MMU_AS(as) + 0x14) ++#define AS_COMMAND(as) (MMU_AS(as) + 0x18) ++#define AS_COMMAND_NOP 0 ++#define AS_COMMAND_UPDATE 1 ++#define AS_COMMAND_LOCK 2 ++#define AS_COMMAND_UNLOCK 3 ++#define AS_COMMAND_FLUSH_PT 4 ++#define AS_COMMAND_FLUSH_MEM 5 ++#define AS_LOCK_REGION_MIN_SIZE (1ULL << 15) ++#define AS_FAULTSTATUS(as) (MMU_AS(as) + 0x1C) ++#define AS_FAULTSTATUS_ACCESS_TYPE_MASK (0x3 << 8) ++#define AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC (0x0 << 8) ++#define AS_FAULTSTATUS_ACCESS_TYPE_EX (0x1 << 8) ++#define AS_FAULTSTATUS_ACCESS_TYPE_READ (0x2 << 8) ++#define AS_FAULTSTATUS_ACCESS_TYPE_WRITE (0x3 << 8) ++#define AS_FAULTADDRESS_LO(as) (MMU_AS(as) + 0x20) ++#define AS_FAULTADDRESS_HI(as) (MMU_AS(as) + 0x24) ++#define AS_STATUS(as) (MMU_AS(as) + 0x28) ++#define AS_STATUS_AS_ACTIVE BIT(0) ++#define AS_TRANSCFG_LO(as) (MMU_AS(as) + 0x30) ++#define AS_TRANSCFG_HI(as) (MMU_AS(as) + 0x34) ++#define AS_TRANSCFG_ADRMODE_UNMAPPED (1 << 0) ++#define AS_TRANSCFG_ADRMODE_IDENTITY (2 << 0) ++#define AS_TRANSCFG_ADRMODE_AARCH64_4K (6 << 0) ++#define AS_TRANSCFG_ADRMODE_AARCH64_64K (8 << 0) ++#define AS_TRANSCFG_INA_BITS(x) ((x) << 6) ++#define AS_TRANSCFG_OUTA_BITS(x) ((x) << 14) ++#define AS_TRANSCFG_SL_CONCAT BIT(22) ++#define AS_TRANSCFG_PTW_MEMATTR_NC (1 << 24) ++#define AS_TRANSCFG_PTW_MEMATTR_WB (2 << 24) ++#define AS_TRANSCFG_PTW_SH_NS (0 << 28) ++#define AS_TRANSCFG_PTW_SH_OS (2 << 28) ++#define AS_TRANSCFG_PTW_SH_IS (3 << 28) ++#define AS_TRANSCFG_PTW_RA BIT(30) ++#define AS_TRANSCFG_DISABLE_HIER_AP BIT(33) ++#define AS_TRANSCFG_DISABLE_AF_FAULT BIT(34) ++#define AS_TRANSCFG_WXN BIT(35) ++#define AS_TRANSCFG_XREADABLE BIT(36) ++#define AS_FAULTEXTRA_LO(as) (MMU_AS(as) + 0x38) ++#define AS_FAULTEXTRA_HI(as) (MMU_AS(as) + 0x3C) ++ ++#define CSF_GPU_LATEST_FLUSH_ID 0x10000 ++ ++#define CSF_DOORBELL(i) (0x80000 + ((i) * 0x10000)) ++#define CSF_GLB_DOORBELL_ID 0 ++ ++#define gpu_write(dev, reg, data) \ ++ writel(data, (dev)->iomem + (reg)) ++ ++#define gpu_read(dev, reg) \ ++ readl((dev)->iomem + (reg)) ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:17 +0100 +Subject: drm/panthor: Add the device logical block + +The panthor driver is designed in a modular way, where each logical +block is dealing with a specific HW-block or software feature. In order +for those blocks to communicate with each other, we need a central +panthor_device collecting all the blocks, and exposing some common +features, like interrupt handling, power management, reset, ... + +This what this panthor_device logical block is about. + +v6: +- Add Maxime's and Heiko's acks +- Keep header inclusion alphabetically ordered + +v5: +- Suspend the MMU/GPU blocks if panthor_fw_resume() fails in + panthor_device_resume() +- Move the pm_runtime_use_autosuspend() call before drm_dev_register() +- Add Liviu's R-b + +v4: +- Check drmm_mutex_init() return code +- Fix panthor_device_reset_work() out path +- Fix the race in the unplug logic +- Fix typos +- Unplug blocks when something fails in panthor_device_init() +- Add Steve's R-b + +v3: +- Add acks for the MIT+GPL2 relicensing +- Fix 32-bit support +- Shorten the sections protected by panthor_device::pm::mmio_lock to fix + lock ordering issues. +- Rename panthor_device::pm::lock into panthor_device::pm::mmio_lock to + better reflect what this lock is protecting +- Use dev_err_probe() +- Make sure we call drm_dev_exit() when something fails half-way in + panthor_device_reset_work() +- Replace CSF_GPU_LATEST_FLUSH_ID_DEFAULT with a constant '1' and a + comment to explain. Also remove setting the dummy flush ID on suspend. +- Remove drm_WARN_ON() in panthor_exception_name() +- Check pirq->suspended in panthor_xxx_irq_raw_handler() + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Steven Price +Reviewed-by: Liviu Dudau +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_device.c | 549 ++++++++++ + drivers/gpu/drm/panthor/panthor_device.h | 394 +++++++ + 2 files changed, 943 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_device.c b/drivers/gpu/drm/panthor/panthor_device.c +new file mode 100644 +index 000000000000..bfe8da4a6e4c +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_device.c +@@ -0,0 +1,549 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2018 Marty E. Plummer */ ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "panthor_devfreq.h" ++#include "panthor_device.h" ++#include "panthor_fw.h" ++#include "panthor_gpu.h" ++#include "panthor_mmu.h" ++#include "panthor_regs.h" ++#include "panthor_sched.h" ++ ++static int panthor_clk_init(struct panthor_device *ptdev) ++{ ++ ptdev->clks.core = devm_clk_get(ptdev->base.dev, NULL); ++ if (IS_ERR(ptdev->clks.core)) ++ return dev_err_probe(ptdev->base.dev, ++ PTR_ERR(ptdev->clks.core), ++ "get 'core' clock failed"); ++ ++ ptdev->clks.stacks = devm_clk_get_optional(ptdev->base.dev, "stacks"); ++ if (IS_ERR(ptdev->clks.stacks)) ++ return dev_err_probe(ptdev->base.dev, ++ PTR_ERR(ptdev->clks.stacks), ++ "get 'stacks' clock failed"); ++ ++ ptdev->clks.coregroup = devm_clk_get_optional(ptdev->base.dev, "coregroup"); ++ if (IS_ERR(ptdev->clks.coregroup)) ++ return dev_err_probe(ptdev->base.dev, ++ PTR_ERR(ptdev->clks.coregroup), ++ "get 'coregroup' clock failed"); ++ ++ drm_info(&ptdev->base, "clock rate = %lu\n", clk_get_rate(ptdev->clks.core)); ++ return 0; ++} ++ ++void panthor_device_unplug(struct panthor_device *ptdev) ++{ ++ /* This function can be called from two different path: the reset work ++ * and the platform device remove callback. drm_dev_unplug() doesn't ++ * deal with concurrent callers, so we have to protect drm_dev_unplug() ++ * calls with our own lock, and bail out if the device is already ++ * unplugged. ++ */ ++ mutex_lock(&ptdev->unplug.lock); ++ if (drm_dev_is_unplugged(&ptdev->base)) { ++ /* Someone beat us, release the lock and wait for the unplug ++ * operation to be reported as done. ++ **/ ++ mutex_unlock(&ptdev->unplug.lock); ++ wait_for_completion(&ptdev->unplug.done); ++ return; ++ } ++ ++ /* Call drm_dev_unplug() so any access to HW blocks happening after ++ * that point get rejected. ++ */ ++ drm_dev_unplug(&ptdev->base); ++ ++ /* We do the rest of the unplug with the unplug lock released, ++ * future callers will wait on ptdev->unplug.done anyway. ++ */ ++ mutex_unlock(&ptdev->unplug.lock); ++ ++ drm_WARN_ON(&ptdev->base, pm_runtime_get_sync(ptdev->base.dev) < 0); ++ ++ /* Now, try to cleanly shutdown the GPU before the device resources ++ * get reclaimed. ++ */ ++ panthor_sched_unplug(ptdev); ++ panthor_fw_unplug(ptdev); ++ panthor_mmu_unplug(ptdev); ++ panthor_gpu_unplug(ptdev); ++ ++ pm_runtime_dont_use_autosuspend(ptdev->base.dev); ++ pm_runtime_put_sync_suspend(ptdev->base.dev); ++ ++ /* Report the unplug operation as done to unblock concurrent ++ * panthor_device_unplug() callers. ++ */ ++ complete_all(&ptdev->unplug.done); ++} ++ ++static void panthor_device_reset_cleanup(struct drm_device *ddev, void *data) ++{ ++ struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base); ++ ++ cancel_work_sync(&ptdev->reset.work); ++ destroy_workqueue(ptdev->reset.wq); ++} ++ ++static void panthor_device_reset_work(struct work_struct *work) ++{ ++ struct panthor_device *ptdev = container_of(work, struct panthor_device, reset.work); ++ int ret = 0, cookie; ++ ++ if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_ACTIVE) { ++ /* ++ * No need for a reset as the device has been (or will be) ++ * powered down ++ */ ++ atomic_set(&ptdev->reset.pending, 0); ++ return; ++ } ++ ++ if (!drm_dev_enter(&ptdev->base, &cookie)) ++ return; ++ ++ panthor_sched_pre_reset(ptdev); ++ panthor_fw_pre_reset(ptdev, true); ++ panthor_mmu_pre_reset(ptdev); ++ panthor_gpu_soft_reset(ptdev); ++ panthor_gpu_l2_power_on(ptdev); ++ panthor_mmu_post_reset(ptdev); ++ ret = panthor_fw_post_reset(ptdev); ++ if (ret) ++ goto out_dev_exit; ++ ++ atomic_set(&ptdev->reset.pending, 0); ++ panthor_sched_post_reset(ptdev); ++ ++out_dev_exit: ++ drm_dev_exit(cookie); ++ ++ if (ret) { ++ panthor_device_unplug(ptdev); ++ drm_err(&ptdev->base, "Failed to boot MCU after reset, making device unusable."); ++ } ++} ++ ++static bool panthor_device_is_initialized(struct panthor_device *ptdev) ++{ ++ return !!ptdev->scheduler; ++} ++ ++static void panthor_device_free_page(struct drm_device *ddev, void *data) ++{ ++ free_page((unsigned long)data); ++} ++ ++int panthor_device_init(struct panthor_device *ptdev) ++{ ++ struct resource *res; ++ struct page *p; ++ int ret; ++ ++ ptdev->coherent = device_get_dma_attr(ptdev->base.dev) == DEV_DMA_COHERENT; ++ ++ init_completion(&ptdev->unplug.done); ++ ret = drmm_mutex_init(&ptdev->base, &ptdev->unplug.lock); ++ if (ret) ++ return ret; ++ ++ ret = drmm_mutex_init(&ptdev->base, &ptdev->pm.mmio_lock); ++ if (ret) ++ return ret; ++ ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED); ++ p = alloc_page(GFP_KERNEL | __GFP_ZERO); ++ if (!p) ++ return -ENOMEM; ++ ++ ptdev->pm.dummy_latest_flush = page_address(p); ++ ret = drmm_add_action_or_reset(&ptdev->base, panthor_device_free_page, ++ ptdev->pm.dummy_latest_flush); ++ if (ret) ++ return ret; ++ ++ /* ++ * Set the dummy page holding the latest flush to 1. This will cause the ++ * flush to avoided as we know it isn't necessary if the submission ++ * happens while the dummy page is mapped. Zero cannot be used because ++ * that means 'always flush'. ++ */ ++ *ptdev->pm.dummy_latest_flush = 1; ++ ++ INIT_WORK(&ptdev->reset.work, panthor_device_reset_work); ++ ptdev->reset.wq = alloc_ordered_workqueue("panthor-reset-wq", 0); ++ if (!ptdev->reset.wq) ++ return -ENOMEM; ++ ++ ret = drmm_add_action_or_reset(&ptdev->base, panthor_device_reset_cleanup, NULL); ++ if (ret) ++ return ret; ++ ++ ret = panthor_clk_init(ptdev); ++ if (ret) ++ return ret; ++ ++ ret = panthor_devfreq_init(ptdev); ++ if (ret) ++ return ret; ++ ++ ptdev->iomem = devm_platform_get_and_ioremap_resource(to_platform_device(ptdev->base.dev), ++ 0, &res); ++ if (IS_ERR(ptdev->iomem)) ++ return PTR_ERR(ptdev->iomem); ++ ++ ptdev->phys_addr = res->start; ++ ++ ret = devm_pm_runtime_enable(ptdev->base.dev); ++ if (ret) ++ return ret; ++ ++ ret = pm_runtime_resume_and_get(ptdev->base.dev); ++ if (ret) ++ return ret; ++ ++ ret = panthor_gpu_init(ptdev); ++ if (ret) ++ goto err_rpm_put; ++ ++ ret = panthor_mmu_init(ptdev); ++ if (ret) ++ goto err_unplug_gpu; ++ ++ ret = panthor_fw_init(ptdev); ++ if (ret) ++ goto err_unplug_mmu; ++ ++ ret = panthor_sched_init(ptdev); ++ if (ret) ++ goto err_unplug_fw; ++ ++ /* ~3 frames */ ++ pm_runtime_set_autosuspend_delay(ptdev->base.dev, 50); ++ pm_runtime_use_autosuspend(ptdev->base.dev); ++ ++ ret = drm_dev_register(&ptdev->base, 0); ++ if (ret) ++ goto err_disable_autosuspend; ++ ++ pm_runtime_put_autosuspend(ptdev->base.dev); ++ return 0; ++ ++err_disable_autosuspend: ++ pm_runtime_dont_use_autosuspend(ptdev->base.dev); ++ panthor_sched_unplug(ptdev); ++ ++err_unplug_fw: ++ panthor_fw_unplug(ptdev); ++ ++err_unplug_mmu: ++ panthor_mmu_unplug(ptdev); ++ ++err_unplug_gpu: ++ panthor_gpu_unplug(ptdev); ++ ++err_rpm_put: ++ pm_runtime_put_sync_suspend(ptdev->base.dev); ++ return ret; ++} ++ ++#define PANTHOR_EXCEPTION(id) \ ++ [DRM_PANTHOR_EXCEPTION_ ## id] = { \ ++ .name = #id, \ ++ } ++ ++struct panthor_exception_info { ++ const char *name; ++}; ++ ++static const struct panthor_exception_info panthor_exception_infos[] = { ++ PANTHOR_EXCEPTION(OK), ++ PANTHOR_EXCEPTION(TERMINATED), ++ PANTHOR_EXCEPTION(KABOOM), ++ PANTHOR_EXCEPTION(EUREKA), ++ PANTHOR_EXCEPTION(ACTIVE), ++ PANTHOR_EXCEPTION(CS_RES_TERM), ++ PANTHOR_EXCEPTION(CS_CONFIG_FAULT), ++ PANTHOR_EXCEPTION(CS_ENDPOINT_FAULT), ++ PANTHOR_EXCEPTION(CS_BUS_FAULT), ++ PANTHOR_EXCEPTION(CS_INSTR_INVALID), ++ PANTHOR_EXCEPTION(CS_CALL_STACK_OVERFLOW), ++ PANTHOR_EXCEPTION(CS_INHERIT_FAULT), ++ PANTHOR_EXCEPTION(INSTR_INVALID_PC), ++ PANTHOR_EXCEPTION(INSTR_INVALID_ENC), ++ PANTHOR_EXCEPTION(INSTR_BARRIER_FAULT), ++ PANTHOR_EXCEPTION(DATA_INVALID_FAULT), ++ PANTHOR_EXCEPTION(TILE_RANGE_FAULT), ++ PANTHOR_EXCEPTION(ADDR_RANGE_FAULT), ++ PANTHOR_EXCEPTION(IMPRECISE_FAULT), ++ PANTHOR_EXCEPTION(OOM), ++ PANTHOR_EXCEPTION(CSF_FW_INTERNAL_ERROR), ++ PANTHOR_EXCEPTION(CSF_RES_EVICTION_TIMEOUT), ++ PANTHOR_EXCEPTION(GPU_BUS_FAULT), ++ PANTHOR_EXCEPTION(GPU_SHAREABILITY_FAULT), ++ PANTHOR_EXCEPTION(SYS_SHAREABILITY_FAULT), ++ PANTHOR_EXCEPTION(GPU_CACHEABILITY_FAULT), ++ PANTHOR_EXCEPTION(TRANSLATION_FAULT_0), ++ PANTHOR_EXCEPTION(TRANSLATION_FAULT_1), ++ PANTHOR_EXCEPTION(TRANSLATION_FAULT_2), ++ PANTHOR_EXCEPTION(TRANSLATION_FAULT_3), ++ PANTHOR_EXCEPTION(TRANSLATION_FAULT_4), ++ PANTHOR_EXCEPTION(PERM_FAULT_0), ++ PANTHOR_EXCEPTION(PERM_FAULT_1), ++ PANTHOR_EXCEPTION(PERM_FAULT_2), ++ PANTHOR_EXCEPTION(PERM_FAULT_3), ++ PANTHOR_EXCEPTION(ACCESS_FLAG_1), ++ PANTHOR_EXCEPTION(ACCESS_FLAG_2), ++ PANTHOR_EXCEPTION(ACCESS_FLAG_3), ++ PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_IN), ++ PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT0), ++ PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT1), ++ PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT2), ++ PANTHOR_EXCEPTION(ADDR_SIZE_FAULT_OUT3), ++ PANTHOR_EXCEPTION(MEM_ATTR_FAULT_0), ++ PANTHOR_EXCEPTION(MEM_ATTR_FAULT_1), ++ PANTHOR_EXCEPTION(MEM_ATTR_FAULT_2), ++ PANTHOR_EXCEPTION(MEM_ATTR_FAULT_3), ++}; ++ ++const char *panthor_exception_name(struct panthor_device *ptdev, u32 exception_code) ++{ ++ if (exception_code >= ARRAY_SIZE(panthor_exception_infos) || ++ !panthor_exception_infos[exception_code].name) ++ return "Unknown exception type"; ++ ++ return panthor_exception_infos[exception_code].name; ++} ++ ++static vm_fault_t panthor_mmio_vm_fault(struct vm_fault *vmf) ++{ ++ struct vm_area_struct *vma = vmf->vma; ++ struct panthor_device *ptdev = vma->vm_private_data; ++ u64 id = (u64)vma->vm_pgoff << PAGE_SHIFT; ++ unsigned long pfn; ++ pgprot_t pgprot; ++ vm_fault_t ret; ++ bool active; ++ int cookie; ++ ++ if (!drm_dev_enter(&ptdev->base, &cookie)) ++ return VM_FAULT_SIGBUS; ++ ++ mutex_lock(&ptdev->pm.mmio_lock); ++ active = atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE; ++ ++ switch (panthor_device_mmio_offset(id)) { ++ case DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET: ++ if (active) ++ pfn = __phys_to_pfn(ptdev->phys_addr + CSF_GPU_LATEST_FLUSH_ID); ++ else ++ pfn = virt_to_pfn(ptdev->pm.dummy_latest_flush); ++ break; ++ ++ default: ++ ret = VM_FAULT_SIGBUS; ++ goto out_unlock; ++ } ++ ++ pgprot = vma->vm_page_prot; ++ if (active) ++ pgprot = pgprot_noncached(pgprot); ++ ++ ret = vmf_insert_pfn_prot(vma, vmf->address, pfn, pgprot); ++ ++out_unlock: ++ mutex_unlock(&ptdev->pm.mmio_lock); ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static const struct vm_operations_struct panthor_mmio_vm_ops = { ++ .fault = panthor_mmio_vm_fault, ++}; ++ ++int panthor_device_mmap_io(struct panthor_device *ptdev, struct vm_area_struct *vma) ++{ ++ u64 id = (u64)vma->vm_pgoff << PAGE_SHIFT; ++ ++ switch (panthor_device_mmio_offset(id)) { ++ case DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET: ++ if (vma->vm_end - vma->vm_start != PAGE_SIZE || ++ (vma->vm_flags & (VM_WRITE | VM_EXEC))) ++ return -EINVAL; ++ ++ break; ++ ++ default: ++ return -EINVAL; ++ } ++ ++ /* Defer actual mapping to the fault handler. */ ++ vma->vm_private_data = ptdev; ++ vma->vm_ops = &panthor_mmio_vm_ops; ++ vm_flags_set(vma, ++ VM_IO | VM_DONTCOPY | VM_DONTEXPAND | ++ VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP); ++ return 0; ++} ++ ++#ifdef CONFIG_PM ++int panthor_device_resume(struct device *dev) ++{ ++ struct panthor_device *ptdev = dev_get_drvdata(dev); ++ int ret, cookie; ++ ++ if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_SUSPENDED) ++ return -EINVAL; ++ ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_RESUMING); ++ ++ ret = clk_prepare_enable(ptdev->clks.core); ++ if (ret) ++ goto err_set_suspended; ++ ++ ret = clk_prepare_enable(ptdev->clks.stacks); ++ if (ret) ++ goto err_disable_core_clk; ++ ++ ret = clk_prepare_enable(ptdev->clks.coregroup); ++ if (ret) ++ goto err_disable_stacks_clk; ++ ++ ret = panthor_devfreq_resume(ptdev); ++ if (ret) ++ goto err_disable_coregroup_clk; ++ ++ if (panthor_device_is_initialized(ptdev) && ++ drm_dev_enter(&ptdev->base, &cookie)) { ++ panthor_gpu_resume(ptdev); ++ panthor_mmu_resume(ptdev); ++ ret = drm_WARN_ON(&ptdev->base, panthor_fw_resume(ptdev)); ++ if (!ret) { ++ panthor_sched_resume(ptdev); ++ } else { ++ panthor_mmu_suspend(ptdev); ++ panthor_gpu_suspend(ptdev); ++ } ++ ++ drm_dev_exit(cookie); ++ ++ if (ret) ++ goto err_suspend_devfreq; ++ } ++ ++ if (atomic_read(&ptdev->reset.pending)) ++ queue_work(ptdev->reset.wq, &ptdev->reset.work); ++ ++ /* Clear all IOMEM mappings pointing to this device after we've ++ * resumed. This way the fake mappings pointing to the dummy pages ++ * are removed and the real iomem mapping will be restored on next ++ * access. ++ */ ++ mutex_lock(&ptdev->pm.mmio_lock); ++ unmap_mapping_range(ptdev->base.anon_inode->i_mapping, ++ DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1); ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_ACTIVE); ++ mutex_unlock(&ptdev->pm.mmio_lock); ++ return 0; ++ ++err_suspend_devfreq: ++ panthor_devfreq_suspend(ptdev); ++ ++err_disable_coregroup_clk: ++ clk_disable_unprepare(ptdev->clks.coregroup); ++ ++err_disable_stacks_clk: ++ clk_disable_unprepare(ptdev->clks.stacks); ++ ++err_disable_core_clk: ++ clk_disable_unprepare(ptdev->clks.core); ++ ++err_set_suspended: ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED); ++ return ret; ++} ++ ++int panthor_device_suspend(struct device *dev) ++{ ++ struct panthor_device *ptdev = dev_get_drvdata(dev); ++ int ret, cookie; ++ ++ if (atomic_read(&ptdev->pm.state) != PANTHOR_DEVICE_PM_STATE_ACTIVE) ++ return -EINVAL; ++ ++ /* Clear all IOMEM mappings pointing to this device before we ++ * shutdown the power-domain and clocks. Failing to do that results ++ * in external aborts when the process accesses the iomem region. ++ * We change the state and call unmap_mapping_range() with the ++ * mmio_lock held to make sure the vm_fault handler won't set up ++ * invalid mappings. ++ */ ++ mutex_lock(&ptdev->pm.mmio_lock); ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDING); ++ unmap_mapping_range(ptdev->base.anon_inode->i_mapping, ++ DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1); ++ mutex_unlock(&ptdev->pm.mmio_lock); ++ ++ if (panthor_device_is_initialized(ptdev) && ++ drm_dev_enter(&ptdev->base, &cookie)) { ++ cancel_work_sync(&ptdev->reset.work); ++ ++ /* We prepare everything as if we were resetting the GPU. ++ * The end of the reset will happen in the resume path though. ++ */ ++ panthor_sched_suspend(ptdev); ++ panthor_fw_suspend(ptdev); ++ panthor_mmu_suspend(ptdev); ++ panthor_gpu_suspend(ptdev); ++ drm_dev_exit(cookie); ++ } ++ ++ ret = panthor_devfreq_suspend(ptdev); ++ if (ret) { ++ if (panthor_device_is_initialized(ptdev) && ++ drm_dev_enter(&ptdev->base, &cookie)) { ++ panthor_gpu_resume(ptdev); ++ panthor_mmu_resume(ptdev); ++ drm_WARN_ON(&ptdev->base, panthor_fw_resume(ptdev)); ++ panthor_sched_resume(ptdev); ++ drm_dev_exit(cookie); ++ } ++ ++ goto err_set_active; ++ } ++ ++ clk_disable_unprepare(ptdev->clks.coregroup); ++ clk_disable_unprepare(ptdev->clks.stacks); ++ clk_disable_unprepare(ptdev->clks.core); ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_SUSPENDED); ++ return 0; ++ ++err_set_active: ++ /* If something failed and we have to revert back to an ++ * active state, we also need to clear the MMIO userspace ++ * mappings, so any dumb pages that were mapped while we ++ * were trying to suspend gets invalidated. ++ */ ++ mutex_lock(&ptdev->pm.mmio_lock); ++ atomic_set(&ptdev->pm.state, PANTHOR_DEVICE_PM_STATE_ACTIVE); ++ unmap_mapping_range(ptdev->base.anon_inode->i_mapping, ++ DRM_PANTHOR_USER_MMIO_OFFSET, 0, 1); ++ mutex_unlock(&ptdev->pm.mmio_lock); ++ return ret; ++} ++#endif +diff --git a/drivers/gpu/drm/panthor/panthor_device.h b/drivers/gpu/drm/panthor/panthor_device.h +new file mode 100644 +index 000000000000..51c9d61b6796 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_device.h +@@ -0,0 +1,394 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2018 Marty E. Plummer */ ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifndef __PANTHOR_DEVICE_H__ ++#define __PANTHOR_DEVICE_H__ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++struct panthor_csf; ++struct panthor_csf_ctx; ++struct panthor_device; ++struct panthor_gpu; ++struct panthor_group_pool; ++struct panthor_heap_pool; ++struct panthor_job; ++struct panthor_mmu; ++struct panthor_fw; ++struct panthor_perfcnt; ++struct panthor_vm; ++struct panthor_vm_pool; ++ ++/** ++ * enum panthor_device_pm_state - PM state ++ */ ++enum panthor_device_pm_state { ++ /** @PANTHOR_DEVICE_PM_STATE_SUSPENDED: Device is suspended. */ ++ PANTHOR_DEVICE_PM_STATE_SUSPENDED = 0, ++ ++ /** @PANTHOR_DEVICE_PM_STATE_RESUMING: Device is being resumed. */ ++ PANTHOR_DEVICE_PM_STATE_RESUMING, ++ ++ /** @PANTHOR_DEVICE_PM_STATE_ACTIVE: Device is active. */ ++ PANTHOR_DEVICE_PM_STATE_ACTIVE, ++ ++ /** @PANTHOR_DEVICE_PM_STATE_SUSPENDING: Device is being suspended. */ ++ PANTHOR_DEVICE_PM_STATE_SUSPENDING, ++}; ++ ++/** ++ * struct panthor_irq - IRQ data ++ * ++ * Used to automate IRQ handling for the 3 different IRQs we have in this driver. ++ */ ++struct panthor_irq { ++ /** @ptdev: Panthor device */ ++ struct panthor_device *ptdev; ++ ++ /** @irq: IRQ number. */ ++ int irq; ++ ++ /** @mask: Current mask being applied to xxx_INT_MASK. */ ++ u32 mask; ++ ++ /** @suspended: Set to true when the IRQ is suspended. */ ++ atomic_t suspended; ++}; ++ ++/** ++ * struct panthor_device - Panthor device ++ */ ++struct panthor_device { ++ /** @base: Base drm_device. */ ++ struct drm_device base; ++ ++ /** @phys_addr: Physical address of the iomem region. */ ++ phys_addr_t phys_addr; ++ ++ /** @iomem: CPU mapping of the IOMEM region. */ ++ void __iomem *iomem; ++ ++ /** @clks: GPU clocks. */ ++ struct { ++ /** @core: Core clock. */ ++ struct clk *core; ++ ++ /** @stacks: Stacks clock. This clock is optional. */ ++ struct clk *stacks; ++ ++ /** @coregroup: Core group clock. This clock is optional. */ ++ struct clk *coregroup; ++ } clks; ++ ++ /** @coherent: True if the CPU/GPU are memory coherent. */ ++ bool coherent; ++ ++ /** @gpu_info: GPU information. */ ++ struct drm_panthor_gpu_info gpu_info; ++ ++ /** @csif_info: Command stream interface information. */ ++ struct drm_panthor_csif_info csif_info; ++ ++ /** @gpu: GPU management data. */ ++ struct panthor_gpu *gpu; ++ ++ /** @fw: FW management data. */ ++ struct panthor_fw *fw; ++ ++ /** @mmu: MMU management data. */ ++ struct panthor_mmu *mmu; ++ ++ /** @scheduler: Scheduler management data. */ ++ struct panthor_scheduler *scheduler; ++ ++ /** @devfreq: Device frequency scaling management data. */ ++ struct panthor_devfreq *devfreq; ++ ++ /** @unplug: Device unplug related fields. */ ++ struct { ++ /** @lock: Lock used to serialize unplug operations. */ ++ struct mutex lock; ++ ++ /** ++ * @done: Completion object signaled when the unplug ++ * operation is done. ++ */ ++ struct completion done; ++ } unplug; ++ ++ /** @reset: Reset related fields. */ ++ struct { ++ /** @wq: Ordered worqueud used to schedule reset operations. */ ++ struct workqueue_struct *wq; ++ ++ /** @work: Reset work. */ ++ struct work_struct work; ++ ++ /** @pending: Set to true if a reset is pending. */ ++ atomic_t pending; ++ } reset; ++ ++ /** @pm: Power management related data. */ ++ struct { ++ /** @state: Power state. */ ++ atomic_t state; ++ ++ /** ++ * @mmio_lock: Lock protecting MMIO userspace CPU mappings. ++ * ++ * This is needed to ensure we map the dummy IO pages when ++ * the device is being suspended, and the real IO pages when ++ * the device is being resumed. We can't just do with the ++ * state atomicity to deal with this race. ++ */ ++ struct mutex mmio_lock; ++ ++ /** ++ * @dummy_latest_flush: Dummy LATEST_FLUSH page. ++ * ++ * Used to replace the real LATEST_FLUSH page when the GPU ++ * is suspended. ++ */ ++ u32 *dummy_latest_flush; ++ } pm; ++}; ++ ++/** ++ * struct panthor_file - Panthor file ++ */ ++struct panthor_file { ++ /** @ptdev: Device attached to this file. */ ++ struct panthor_device *ptdev; ++ ++ /** @vms: VM pool attached to this file. */ ++ struct panthor_vm_pool *vms; ++ ++ /** @groups: Scheduling group pool attached to this file. */ ++ struct panthor_group_pool *groups; ++}; ++ ++int panthor_device_init(struct panthor_device *ptdev); ++void panthor_device_unplug(struct panthor_device *ptdev); ++ ++/** ++ * panthor_device_schedule_reset() - Schedules a reset operation ++ */ ++static inline void panthor_device_schedule_reset(struct panthor_device *ptdev) ++{ ++ if (!atomic_cmpxchg(&ptdev->reset.pending, 0, 1) && ++ atomic_read(&ptdev->pm.state) == PANTHOR_DEVICE_PM_STATE_ACTIVE) ++ queue_work(ptdev->reset.wq, &ptdev->reset.work); ++} ++ ++/** ++ * panthor_device_reset_is_pending() - Checks if a reset is pending. ++ * ++ * Return: true if a reset is pending, false otherwise. ++ */ ++static inline bool panthor_device_reset_is_pending(struct panthor_device *ptdev) ++{ ++ return atomic_read(&ptdev->reset.pending) != 0; ++} ++ ++int panthor_device_mmap_io(struct panthor_device *ptdev, ++ struct vm_area_struct *vma); ++ ++int panthor_device_resume(struct device *dev); ++int panthor_device_suspend(struct device *dev); ++ ++enum drm_panthor_exception_type { ++ DRM_PANTHOR_EXCEPTION_OK = 0x00, ++ DRM_PANTHOR_EXCEPTION_TERMINATED = 0x04, ++ DRM_PANTHOR_EXCEPTION_KABOOM = 0x05, ++ DRM_PANTHOR_EXCEPTION_EUREKA = 0x06, ++ DRM_PANTHOR_EXCEPTION_ACTIVE = 0x08, ++ DRM_PANTHOR_EXCEPTION_CS_RES_TERM = 0x0f, ++ DRM_PANTHOR_EXCEPTION_MAX_NON_FAULT = 0x3f, ++ DRM_PANTHOR_EXCEPTION_CS_CONFIG_FAULT = 0x40, ++ DRM_PANTHOR_EXCEPTION_CS_ENDPOINT_FAULT = 0x44, ++ DRM_PANTHOR_EXCEPTION_CS_BUS_FAULT = 0x48, ++ DRM_PANTHOR_EXCEPTION_CS_INSTR_INVALID = 0x49, ++ DRM_PANTHOR_EXCEPTION_CS_CALL_STACK_OVERFLOW = 0x4a, ++ DRM_PANTHOR_EXCEPTION_CS_INHERIT_FAULT = 0x4b, ++ DRM_PANTHOR_EXCEPTION_INSTR_INVALID_PC = 0x50, ++ DRM_PANTHOR_EXCEPTION_INSTR_INVALID_ENC = 0x51, ++ DRM_PANTHOR_EXCEPTION_INSTR_BARRIER_FAULT = 0x55, ++ DRM_PANTHOR_EXCEPTION_DATA_INVALID_FAULT = 0x58, ++ DRM_PANTHOR_EXCEPTION_TILE_RANGE_FAULT = 0x59, ++ DRM_PANTHOR_EXCEPTION_ADDR_RANGE_FAULT = 0x5a, ++ DRM_PANTHOR_EXCEPTION_IMPRECISE_FAULT = 0x5b, ++ DRM_PANTHOR_EXCEPTION_OOM = 0x60, ++ DRM_PANTHOR_EXCEPTION_CSF_FW_INTERNAL_ERROR = 0x68, ++ DRM_PANTHOR_EXCEPTION_CSF_RES_EVICTION_TIMEOUT = 0x69, ++ DRM_PANTHOR_EXCEPTION_GPU_BUS_FAULT = 0x80, ++ DRM_PANTHOR_EXCEPTION_GPU_SHAREABILITY_FAULT = 0x88, ++ DRM_PANTHOR_EXCEPTION_SYS_SHAREABILITY_FAULT = 0x89, ++ DRM_PANTHOR_EXCEPTION_GPU_CACHEABILITY_FAULT = 0x8a, ++ DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_0 = 0xc0, ++ DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_1 = 0xc1, ++ DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_2 = 0xc2, ++ DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_3 = 0xc3, ++ DRM_PANTHOR_EXCEPTION_TRANSLATION_FAULT_4 = 0xc4, ++ DRM_PANTHOR_EXCEPTION_PERM_FAULT_0 = 0xc8, ++ DRM_PANTHOR_EXCEPTION_PERM_FAULT_1 = 0xc9, ++ DRM_PANTHOR_EXCEPTION_PERM_FAULT_2 = 0xca, ++ DRM_PANTHOR_EXCEPTION_PERM_FAULT_3 = 0xcb, ++ DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_1 = 0xd9, ++ DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_2 = 0xda, ++ DRM_PANTHOR_EXCEPTION_ACCESS_FLAG_3 = 0xdb, ++ DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_IN = 0xe0, ++ DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT0 = 0xe4, ++ DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT1 = 0xe5, ++ DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT2 = 0xe6, ++ DRM_PANTHOR_EXCEPTION_ADDR_SIZE_FAULT_OUT3 = 0xe7, ++ DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_0 = 0xe8, ++ DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_1 = 0xe9, ++ DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_2 = 0xea, ++ DRM_PANTHOR_EXCEPTION_MEM_ATTR_FAULT_3 = 0xeb, ++}; ++ ++/** ++ * panthor_exception_is_fault() - Checks if an exception is a fault. ++ * ++ * Return: true if the exception is a fault, false otherwise. ++ */ ++static inline bool ++panthor_exception_is_fault(u32 exception_code) ++{ ++ return exception_code > DRM_PANTHOR_EXCEPTION_MAX_NON_FAULT; ++} ++ ++const char *panthor_exception_name(struct panthor_device *ptdev, ++ u32 exception_code); ++ ++/** ++ * PANTHOR_IRQ_HANDLER() - Define interrupt handlers and the interrupt ++ * registration function. ++ * ++ * The boiler-plate to gracefully deal with shared interrupts is ++ * auto-generated. All you have to do is call PANTHOR_IRQ_HANDLER() ++ * just after the actual handler. The handler prototype is: ++ * ++ * void (*handler)(struct panthor_device *, u32 status); ++ */ ++#define PANTHOR_IRQ_HANDLER(__name, __reg_prefix, __handler) \ ++static irqreturn_t panthor_ ## __name ## _irq_raw_handler(int irq, void *data) \ ++{ \ ++ struct panthor_irq *pirq = data; \ ++ struct panthor_device *ptdev = pirq->ptdev; \ ++ \ ++ if (atomic_read(&pirq->suspended)) \ ++ return IRQ_NONE; \ ++ if (!gpu_read(ptdev, __reg_prefix ## _INT_STAT)) \ ++ return IRQ_NONE; \ ++ \ ++ gpu_write(ptdev, __reg_prefix ## _INT_MASK, 0); \ ++ return IRQ_WAKE_THREAD; \ ++} \ ++ \ ++static irqreturn_t panthor_ ## __name ## _irq_threaded_handler(int irq, void *data) \ ++{ \ ++ struct panthor_irq *pirq = data; \ ++ struct panthor_device *ptdev = pirq->ptdev; \ ++ irqreturn_t ret = IRQ_NONE; \ ++ \ ++ while (true) { \ ++ u32 status = gpu_read(ptdev, __reg_prefix ## _INT_RAWSTAT) & pirq->mask; \ ++ \ ++ if (!status) \ ++ break; \ ++ \ ++ gpu_write(ptdev, __reg_prefix ## _INT_CLEAR, status); \ ++ \ ++ __handler(ptdev, status); \ ++ ret = IRQ_HANDLED; \ ++ } \ ++ \ ++ if (!atomic_read(&pirq->suspended)) \ ++ gpu_write(ptdev, __reg_prefix ## _INT_MASK, pirq->mask); \ ++ \ ++ return ret; \ ++} \ ++ \ ++static inline void panthor_ ## __name ## _irq_suspend(struct panthor_irq *pirq) \ ++{ \ ++ int cookie; \ ++ \ ++ atomic_set(&pirq->suspended, true); \ ++ \ ++ if (drm_dev_enter(&pirq->ptdev->base, &cookie)) { \ ++ gpu_write(pirq->ptdev, __reg_prefix ## _INT_MASK, 0); \ ++ synchronize_irq(pirq->irq); \ ++ drm_dev_exit(cookie); \ ++ } \ ++ \ ++ pirq->mask = 0; \ ++} \ ++ \ ++static inline void panthor_ ## __name ## _irq_resume(struct panthor_irq *pirq, u32 mask) \ ++{ \ ++ int cookie; \ ++ \ ++ atomic_set(&pirq->suspended, false); \ ++ pirq->mask = mask; \ ++ \ ++ if (drm_dev_enter(&pirq->ptdev->base, &cookie)) { \ ++ gpu_write(pirq->ptdev, __reg_prefix ## _INT_CLEAR, mask); \ ++ gpu_write(pirq->ptdev, __reg_prefix ## _INT_MASK, mask); \ ++ drm_dev_exit(cookie); \ ++ } \ ++} \ ++ \ ++static int panthor_request_ ## __name ## _irq(struct panthor_device *ptdev, \ ++ struct panthor_irq *pirq, \ ++ int irq, u32 mask) \ ++{ \ ++ pirq->ptdev = ptdev; \ ++ pirq->irq = irq; \ ++ panthor_ ## __name ## _irq_resume(pirq, mask); \ ++ \ ++ return devm_request_threaded_irq(ptdev->base.dev, irq, \ ++ panthor_ ## __name ## _irq_raw_handler, \ ++ panthor_ ## __name ## _irq_threaded_handler, \ ++ IRQF_SHARED, KBUILD_MODNAME "-" # __name, \ ++ pirq); \ ++} ++ ++/** ++ * panthor_device_mmio_offset() - Turn a user MMIO offset into a kernel one ++ * @offset: Offset to convert. ++ * ++ * With 32-bit systems being limited by the 32-bit representation of mmap2's ++ * pgoffset field, we need to make the MMIO offset arch specific. This function ++ * converts a user MMIO offset into something the kernel driver understands. ++ * ++ * If the kernel and userspace architecture match, the offset is unchanged. If ++ * the kernel is 64-bit and userspace is 32-bit, the offset is adjusted to match ++ * 64-bit offsets. 32-bit kernel with 64-bit userspace is impossible. ++ * ++ * Return: Adjusted offset. ++ */ ++static inline u64 panthor_device_mmio_offset(u64 offset) ++{ ++#ifdef CONFIG_ARM64 ++ if (test_tsk_thread_flag(current, TIF_32BIT)) ++ offset += DRM_PANTHOR_USER_MMIO_OFFSET_64BIT - DRM_PANTHOR_USER_MMIO_OFFSET_32BIT; ++#endif ++ ++ return offset; ++} ++ ++extern struct workqueue_struct *panthor_cleanup_wq; ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:18 +0100 +Subject: drm/panthor: Add the GPU logical block + +Handles everything that's not related to the FW, the MMU or the +scheduler. This is the block dealing with the GPU property retrieval, +the GPU block power on/off logic, and some global operations, like +global cache flushing. + +v6: +- Add Maxime's and Heiko's acks + +v5: +- Fix GPU_MODEL() kernel doc +- Fix test in panthor_gpu_block_power_off() +- Add Steve's R-b + +v4: +- Expose CORE_FEATURES through DEV_QUERY + +v3: +- Add acks for the MIT/GPL2 relicensing +- Use macros to extract GPU ID info +- Make sure we reset clear pending_reqs bits when wait_event_timeout() + times out but the corresponding bit is cleared in GPU_INT_RAWSTAT + (can happen if the IRQ is masked or HW takes to long to call the IRQ + handler) +- GPU_MODEL now takes separate arch and product majors to be more + readable. +- Drop GPU_IRQ_MCU_STATUS_CHANGED from interrupt mask. +- Handle GPU_IRQ_PROTM_FAULT correctly (don't output registers that are + not updated for protected interrupts). +- Minor code tidy ups + +Cc: Alexey Sheplyakov # MIT+GPL2 relicensing +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_gpu.c | 482 ++++++++++ + drivers/gpu/drm/panthor/panthor_gpu.h | 52 + + 2 files changed, 534 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c +new file mode 100644 +index 000000000000..6dbbc4cfbe7e +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_gpu.c +@@ -0,0 +1,482 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2018 Marty E. Plummer */ ++/* Copyright 2019 Linaro, Ltd., Rob Herring */ ++/* Copyright 2019 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "panthor_device.h" ++#include "panthor_gpu.h" ++#include "panthor_regs.h" ++ ++/** ++ * struct panthor_gpu - GPU block management data. ++ */ ++struct panthor_gpu { ++ /** @irq: GPU irq. */ ++ struct panthor_irq irq; ++ ++ /** @reqs_lock: Lock protecting access to pending_reqs. */ ++ spinlock_t reqs_lock; ++ ++ /** @pending_reqs: Pending GPU requests. */ ++ u32 pending_reqs; ++ ++ /** @reqs_acked: GPU request wait queue. */ ++ wait_queue_head_t reqs_acked; ++}; ++ ++/** ++ * struct panthor_model - GPU model description ++ */ ++struct panthor_model { ++ /** @name: Model name. */ ++ const char *name; ++ ++ /** @arch_major: Major version number of architecture. */ ++ u8 arch_major; ++ ++ /** @product_major: Major version number of product. */ ++ u8 product_major; ++}; ++ ++/** ++ * GPU_MODEL() - Define a GPU model. A GPU product can be uniquely identified ++ * by a combination of the major architecture version and the major product ++ * version. ++ * @_name: Name for the GPU model. ++ * @_arch_major: Architecture major. ++ * @_product_major: Product major. ++ */ ++#define GPU_MODEL(_name, _arch_major, _product_major) \ ++{\ ++ .name = __stringify(_name), \ ++ .arch_major = _arch_major, \ ++ .product_major = _product_major, \ ++} ++ ++static const struct panthor_model gpu_models[] = { ++ GPU_MODEL(g610, 10, 7), ++ {}, ++}; ++ ++#define GPU_INTERRUPTS_MASK \ ++ (GPU_IRQ_FAULT | \ ++ GPU_IRQ_PROTM_FAULT | \ ++ GPU_IRQ_RESET_COMPLETED | \ ++ GPU_IRQ_CLEAN_CACHES_COMPLETED) ++ ++static void panthor_gpu_init_info(struct panthor_device *ptdev) ++{ ++ const struct panthor_model *model; ++ u32 arch_major, product_major; ++ u32 major, minor, status; ++ unsigned int i; ++ ++ ptdev->gpu_info.gpu_id = gpu_read(ptdev, GPU_ID); ++ ptdev->gpu_info.csf_id = gpu_read(ptdev, GPU_CSF_ID); ++ ptdev->gpu_info.gpu_rev = gpu_read(ptdev, GPU_REVID); ++ ptdev->gpu_info.core_features = gpu_read(ptdev, GPU_CORE_FEATURES); ++ ptdev->gpu_info.l2_features = gpu_read(ptdev, GPU_L2_FEATURES); ++ ptdev->gpu_info.tiler_features = gpu_read(ptdev, GPU_TILER_FEATURES); ++ ptdev->gpu_info.mem_features = gpu_read(ptdev, GPU_MEM_FEATURES); ++ ptdev->gpu_info.mmu_features = gpu_read(ptdev, GPU_MMU_FEATURES); ++ ptdev->gpu_info.thread_features = gpu_read(ptdev, GPU_THREAD_FEATURES); ++ ptdev->gpu_info.max_threads = gpu_read(ptdev, GPU_THREAD_MAX_THREADS); ++ ptdev->gpu_info.thread_max_workgroup_size = gpu_read(ptdev, GPU_THREAD_MAX_WORKGROUP_SIZE); ++ ptdev->gpu_info.thread_max_barrier_size = gpu_read(ptdev, GPU_THREAD_MAX_BARRIER_SIZE); ++ ptdev->gpu_info.coherency_features = gpu_read(ptdev, GPU_COHERENCY_FEATURES); ++ for (i = 0; i < 4; i++) ++ ptdev->gpu_info.texture_features[i] = gpu_read(ptdev, GPU_TEXTURE_FEATURES(i)); ++ ++ ptdev->gpu_info.as_present = gpu_read(ptdev, GPU_AS_PRESENT); ++ ++ ptdev->gpu_info.shader_present = gpu_read(ptdev, GPU_SHADER_PRESENT_LO); ++ ptdev->gpu_info.shader_present |= (u64)gpu_read(ptdev, GPU_SHADER_PRESENT_HI) << 32; ++ ++ ptdev->gpu_info.tiler_present = gpu_read(ptdev, GPU_TILER_PRESENT_LO); ++ ptdev->gpu_info.tiler_present |= (u64)gpu_read(ptdev, GPU_TILER_PRESENT_HI) << 32; ++ ++ ptdev->gpu_info.l2_present = gpu_read(ptdev, GPU_L2_PRESENT_LO); ++ ptdev->gpu_info.l2_present |= (u64)gpu_read(ptdev, GPU_L2_PRESENT_HI) << 32; ++ ++ arch_major = GPU_ARCH_MAJOR(ptdev->gpu_info.gpu_id); ++ product_major = GPU_PROD_MAJOR(ptdev->gpu_info.gpu_id); ++ major = GPU_VER_MAJOR(ptdev->gpu_info.gpu_id); ++ minor = GPU_VER_MINOR(ptdev->gpu_info.gpu_id); ++ status = GPU_VER_STATUS(ptdev->gpu_info.gpu_id); ++ ++ for (model = gpu_models; model->name; model++) { ++ if (model->arch_major == arch_major && ++ model->product_major == product_major) ++ break; ++ } ++ ++ drm_info(&ptdev->base, ++ "mali-%s id 0x%x major 0x%x minor 0x%x status 0x%x", ++ model->name ?: "unknown", ptdev->gpu_info.gpu_id >> 16, ++ major, minor, status); ++ ++ drm_info(&ptdev->base, ++ "Features: L2:%#x Tiler:%#x Mem:%#x MMU:%#x AS:%#x", ++ ptdev->gpu_info.l2_features, ++ ptdev->gpu_info.tiler_features, ++ ptdev->gpu_info.mem_features, ++ ptdev->gpu_info.mmu_features, ++ ptdev->gpu_info.as_present); ++ ++ drm_info(&ptdev->base, ++ "shader_present=0x%0llx l2_present=0x%0llx tiler_present=0x%0llx", ++ ptdev->gpu_info.shader_present, ptdev->gpu_info.l2_present, ++ ptdev->gpu_info.tiler_present); ++} ++ ++static void panthor_gpu_irq_handler(struct panthor_device *ptdev, u32 status) ++{ ++ if (status & GPU_IRQ_FAULT) { ++ u32 fault_status = gpu_read(ptdev, GPU_FAULT_STATUS); ++ u64 address = ((u64)gpu_read(ptdev, GPU_FAULT_ADDR_HI) << 32) | ++ gpu_read(ptdev, GPU_FAULT_ADDR_LO); ++ ++ drm_warn(&ptdev->base, "GPU Fault 0x%08x (%s) at 0x%016llx\n", ++ fault_status, panthor_exception_name(ptdev, fault_status & 0xFF), ++ address); ++ } ++ if (status & GPU_IRQ_PROTM_FAULT) ++ drm_warn(&ptdev->base, "GPU Fault in protected mode\n"); ++ ++ spin_lock(&ptdev->gpu->reqs_lock); ++ if (status & ptdev->gpu->pending_reqs) { ++ ptdev->gpu->pending_reqs &= ~status; ++ wake_up_all(&ptdev->gpu->reqs_acked); ++ } ++ spin_unlock(&ptdev->gpu->reqs_lock); ++} ++PANTHOR_IRQ_HANDLER(gpu, GPU, panthor_gpu_irq_handler); ++ ++/** ++ * panthor_gpu_unplug() - Called when the GPU is unplugged. ++ * @ptdev: Device to unplug. ++ */ ++void panthor_gpu_unplug(struct panthor_device *ptdev) ++{ ++ unsigned long flags; ++ ++ /* Make sure the IRQ handler is not running after that point. */ ++ panthor_gpu_irq_suspend(&ptdev->gpu->irq); ++ ++ /* Wake-up all waiters. */ ++ spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags); ++ ptdev->gpu->pending_reqs = 0; ++ wake_up_all(&ptdev->gpu->reqs_acked); ++ spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags); ++} ++ ++/** ++ * panthor_gpu_init() - Initialize the GPU block ++ * @ptdev: Device. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_gpu_init(struct panthor_device *ptdev) ++{ ++ struct panthor_gpu *gpu; ++ u32 pa_bits; ++ int ret, irq; ++ ++ gpu = drmm_kzalloc(&ptdev->base, sizeof(*gpu), GFP_KERNEL); ++ if (!gpu) ++ return -ENOMEM; ++ ++ spin_lock_init(&gpu->reqs_lock); ++ init_waitqueue_head(&gpu->reqs_acked); ++ ptdev->gpu = gpu; ++ panthor_gpu_init_info(ptdev); ++ ++ dma_set_max_seg_size(ptdev->base.dev, UINT_MAX); ++ pa_bits = GPU_MMU_FEATURES_PA_BITS(ptdev->gpu_info.mmu_features); ++ ret = dma_set_mask_and_coherent(ptdev->base.dev, DMA_BIT_MASK(pa_bits)); ++ if (ret) ++ return ret; ++ ++ irq = platform_get_irq_byname(to_platform_device(ptdev->base.dev), "gpu"); ++ if (irq <= 0) ++ return ret; ++ ++ ret = panthor_request_gpu_irq(ptdev, &ptdev->gpu->irq, irq, GPU_INTERRUPTS_MASK); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++/** ++ * panthor_gpu_block_power_off() - Power-off a specific block of the GPU ++ * @ptdev: Device. ++ * @blk_name: Block name. ++ * @pwroff_reg: Power-off register for this block. ++ * @pwrtrans_reg: Power transition register for this block. ++ * @mask: Sub-elements to power-off. ++ * @timeout_us: Timeout in microseconds. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_gpu_block_power_off(struct panthor_device *ptdev, ++ const char *blk_name, ++ u32 pwroff_reg, u32 pwrtrans_reg, ++ u64 mask, u32 timeout_us) ++{ ++ u32 val, i; ++ int ret; ++ ++ for (i = 0; i < 2; i++) { ++ u32 mask32 = mask >> (i * 32); ++ ++ if (!mask32) ++ continue; ++ ++ ret = readl_relaxed_poll_timeout(ptdev->iomem + pwrtrans_reg + (i * 4), ++ val, !(mask32 & val), ++ 100, timeout_us); ++ if (ret) { ++ drm_err(&ptdev->base, "timeout waiting on %s:%llx power transition", ++ blk_name, mask); ++ return ret; ++ } ++ } ++ ++ if (mask & GENMASK(31, 0)) ++ gpu_write(ptdev, pwroff_reg, mask); ++ ++ if (mask >> 32) ++ gpu_write(ptdev, pwroff_reg + 4, mask >> 32); ++ ++ for (i = 0; i < 2; i++) { ++ u32 mask32 = mask >> (i * 32); ++ ++ if (!mask32) ++ continue; ++ ++ ret = readl_relaxed_poll_timeout(ptdev->iomem + pwrtrans_reg + (i * 4), ++ val, !(mask32 & val), ++ 100, timeout_us); ++ if (ret) { ++ drm_err(&ptdev->base, "timeout waiting on %s:%llx power transition", ++ blk_name, mask); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_gpu_block_power_on() - Power-on a specific block of the GPU ++ * @ptdev: Device. ++ * @blk_name: Block name. ++ * @pwron_reg: Power-on register for this block. ++ * @pwrtrans_reg: Power transition register for this block. ++ * @rdy_reg: Power transition ready register. ++ * @mask: Sub-elements to power-on. ++ * @timeout_us: Timeout in microseconds. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_gpu_block_power_on(struct panthor_device *ptdev, ++ const char *blk_name, ++ u32 pwron_reg, u32 pwrtrans_reg, ++ u32 rdy_reg, u64 mask, u32 timeout_us) ++{ ++ u32 val, i; ++ int ret; ++ ++ for (i = 0; i < 2; i++) { ++ u32 mask32 = mask >> (i * 32); ++ ++ if (!mask32) ++ continue; ++ ++ ret = readl_relaxed_poll_timeout(ptdev->iomem + pwrtrans_reg + (i * 4), ++ val, !(mask32 & val), ++ 100, timeout_us); ++ if (ret) { ++ drm_err(&ptdev->base, "timeout waiting on %s:%llx power transition", ++ blk_name, mask); ++ return ret; ++ } ++ } ++ ++ if (mask & GENMASK(31, 0)) ++ gpu_write(ptdev, pwron_reg, mask); ++ ++ if (mask >> 32) ++ gpu_write(ptdev, pwron_reg + 4, mask >> 32); ++ ++ for (i = 0; i < 2; i++) { ++ u32 mask32 = mask >> (i * 32); ++ ++ if (!mask32) ++ continue; ++ ++ ret = readl_relaxed_poll_timeout(ptdev->iomem + rdy_reg + (i * 4), ++ val, (mask32 & val) == mask32, ++ 100, timeout_us); ++ if (ret) { ++ drm_err(&ptdev->base, "timeout waiting on %s:%llx readyness", ++ blk_name, mask); ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_gpu_l2_power_on() - Power-on the L2-cache ++ * @ptdev: Device. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_gpu_l2_power_on(struct panthor_device *ptdev) ++{ ++ if (ptdev->gpu_info.l2_present != 1) { ++ /* ++ * Only support one core group now. ++ * ~(l2_present - 1) unsets all bits in l2_present except ++ * the bottom bit. (l2_present - 2) has all the bits in ++ * the first core group set. AND them together to generate ++ * a mask of cores in the first core group. ++ */ ++ u64 core_mask = ~(ptdev->gpu_info.l2_present - 1) & ++ (ptdev->gpu_info.l2_present - 2); ++ drm_info_once(&ptdev->base, "using only 1st core group (%lu cores from %lu)\n", ++ hweight64(core_mask), ++ hweight64(ptdev->gpu_info.shader_present)); ++ } ++ ++ return panthor_gpu_power_on(ptdev, L2, 1, 20000); ++} ++ ++/** ++ * panthor_gpu_flush_caches() - Flush caches ++ * @ptdev: Device. ++ * @l2: L2 flush type. ++ * @lsc: LSC flush type. ++ * @other: Other flush type. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_gpu_flush_caches(struct panthor_device *ptdev, ++ u32 l2, u32 lsc, u32 other) ++{ ++ bool timedout = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags); ++ if (!drm_WARN_ON(&ptdev->base, ++ ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) { ++ ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED; ++ gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other)); ++ } ++ spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags); ++ ++ if (!wait_event_timeout(ptdev->gpu->reqs_acked, ++ !(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED), ++ msecs_to_jiffies(100))) { ++ spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags); ++ if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 && ++ !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED)) ++ timedout = true; ++ else ++ ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED; ++ spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags); ++ } ++ ++ if (timedout) { ++ drm_err(&ptdev->base, "Flush caches timeout"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_gpu_soft_reset() - Issue a soft-reset ++ * @ptdev: Device. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_gpu_soft_reset(struct panthor_device *ptdev) ++{ ++ bool timedout = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags); ++ if (!drm_WARN_ON(&ptdev->base, ++ ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED)) { ++ ptdev->gpu->pending_reqs |= GPU_IRQ_RESET_COMPLETED; ++ gpu_write(ptdev, GPU_INT_CLEAR, GPU_IRQ_RESET_COMPLETED); ++ gpu_write(ptdev, GPU_CMD, GPU_SOFT_RESET); ++ } ++ spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags); ++ ++ if (!wait_event_timeout(ptdev->gpu->reqs_acked, ++ !(ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED), ++ msecs_to_jiffies(100))) { ++ spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags); ++ if ((ptdev->gpu->pending_reqs & GPU_IRQ_RESET_COMPLETED) != 0 && ++ !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_RESET_COMPLETED)) ++ timedout = true; ++ else ++ ptdev->gpu->pending_reqs &= ~GPU_IRQ_RESET_COMPLETED; ++ spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags); ++ } ++ ++ if (timedout) { ++ drm_err(&ptdev->base, "Soft reset timeout"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_gpu_suspend() - Suspend the GPU block. ++ * @ptdev: Device. ++ * ++ * Suspend the GPU irq. This should be called last in the suspend procedure, ++ * after all other blocks have been suspented. ++ */ ++void panthor_gpu_suspend(struct panthor_device *ptdev) ++{ ++ /* ++ * It may be preferable to simply power down the L2, but for now just ++ * soft-reset which will leave the L2 powered down. ++ */ ++ panthor_gpu_soft_reset(ptdev); ++ panthor_gpu_irq_suspend(&ptdev->gpu->irq); ++} ++ ++/** ++ * panthor_gpu_resume() - Resume the GPU block. ++ * @ptdev: Device. ++ * ++ * Resume the IRQ handler and power-on the L2-cache. ++ * The FW takes care of powering the other blocks. ++ */ ++void panthor_gpu_resume(struct panthor_device *ptdev) ++{ ++ panthor_gpu_irq_resume(&ptdev->gpu->irq, GPU_INTERRUPTS_MASK); ++ panthor_gpu_l2_power_on(ptdev); ++} +diff --git a/drivers/gpu/drm/panthor/panthor_gpu.h b/drivers/gpu/drm/panthor/panthor_gpu.h +new file mode 100644 +index 000000000000..bba7555dd3c6 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_gpu.h +@@ -0,0 +1,52 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2018 Marty E. Plummer */ ++/* Copyright 2019 Collabora ltd. */ ++ ++#ifndef __PANTHOR_GPU_H__ ++#define __PANTHOR_GPU_H__ ++ ++struct panthor_device; ++ ++int panthor_gpu_init(struct panthor_device *ptdev); ++void panthor_gpu_unplug(struct panthor_device *ptdev); ++void panthor_gpu_suspend(struct panthor_device *ptdev); ++void panthor_gpu_resume(struct panthor_device *ptdev); ++ ++int panthor_gpu_block_power_on(struct panthor_device *ptdev, ++ const char *blk_name, ++ u32 pwron_reg, u32 pwrtrans_reg, ++ u32 rdy_reg, u64 mask, u32 timeout_us); ++int panthor_gpu_block_power_off(struct panthor_device *ptdev, ++ const char *blk_name, ++ u32 pwroff_reg, u32 pwrtrans_reg, ++ u64 mask, u32 timeout_us); ++ ++/** ++ * panthor_gpu_power_on() - Power on the GPU block. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++#define panthor_gpu_power_on(ptdev, type, mask, timeout_us) \ ++ panthor_gpu_block_power_on(ptdev, #type, \ ++ type ## _PWRON_LO, \ ++ type ## _PWRTRANS_LO, \ ++ type ## _READY_LO, \ ++ mask, timeout_us) ++ ++/** ++ * panthor_gpu_power_off() - Power off the GPU block. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++#define panthor_gpu_power_off(ptdev, type, mask, timeout_us) \ ++ panthor_gpu_block_power_off(ptdev, #type, \ ++ type ## _PWROFF_LO, \ ++ type ## _PWRTRANS_LO, \ ++ mask, timeout_us) ++ ++int panthor_gpu_l2_power_on(struct panthor_device *ptdev); ++int panthor_gpu_flush_caches(struct panthor_device *ptdev, ++ u32 l2, u32 lsc, u32 other); ++int panthor_gpu_soft_reset(struct panthor_device *ptdev); ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:19 +0100 +Subject: drm/panthor: Add GEM logical block + +Anything relating to GEM object management is placed here. Nothing +particularly interesting here, given the implementation is based on +drm_gem_shmem_object, which is doing most of the work. + +v6: +- Add Maxime's and Heiko's acks +- Return a page-aligned BO size to userspace when creating a BO +- Keep header inclusion alphabetically ordered + +v5: +- Add Liviu's and Steve's R-b + +v4: +- Force kernel BOs to be GPU mapped +- Make panthor_kernel_bo_destroy() robust against ERR/NULL BO pointers + to simplify the call sites + +v3: +- Add acks for the MIT/GPL2 relicensing +- Provide a panthor_kernel_bo abstraction for buffer objects managed by + the kernel (will replace panthor_fw_mem and be used everywhere we were + using panthor_gem_create_and_map() before) +- Adjust things to match drm_gpuvm changes +- Change return of panthor_gem_create_with_handle() to int + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Liviu Dudau +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_gem.c | 230 ++++++++++ + drivers/gpu/drm/panthor/panthor_gem.h | 142 ++++++ + 2 files changed, 372 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_gem.c b/drivers/gpu/drm/panthor/panthor_gem.c +new file mode 100644 +index 000000000000..d6483266d0c2 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_gem.c +@@ -0,0 +1,230 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "panthor_device.h" ++#include "panthor_gem.h" ++#include "panthor_mmu.h" ++ ++static void panthor_gem_free_object(struct drm_gem_object *obj) ++{ ++ struct panthor_gem_object *bo = to_panthor_bo(obj); ++ struct drm_gem_object *vm_root_gem = bo->exclusive_vm_root_gem; ++ ++ drm_gem_free_mmap_offset(&bo->base.base); ++ mutex_destroy(&bo->gpuva_list_lock); ++ drm_gem_shmem_free(&bo->base); ++ drm_gem_object_put(vm_root_gem); ++} ++ ++/** ++ * panthor_kernel_bo_destroy() - Destroy a kernel buffer object ++ * @vm: The VM this BO was mapped to. ++ * @bo: Kernel buffer object to destroy. If NULL or an ERR_PTR(), the destruction ++ * is skipped. ++ */ ++void panthor_kernel_bo_destroy(struct panthor_vm *vm, ++ struct panthor_kernel_bo *bo) ++{ ++ int ret; ++ ++ if (IS_ERR_OR_NULL(bo)) ++ return; ++ ++ panthor_kernel_bo_vunmap(bo); ++ ++ if (drm_WARN_ON(bo->obj->dev, ++ to_panthor_bo(bo->obj)->exclusive_vm_root_gem != panthor_vm_root_gem(vm))) ++ goto out_free_bo; ++ ++ ret = panthor_vm_unmap_range(vm, bo->va_node.start, ++ panthor_kernel_bo_size(bo)); ++ if (ret) ++ goto out_free_bo; ++ ++ panthor_vm_free_va(vm, &bo->va_node); ++ drm_gem_object_put(bo->obj); ++ ++out_free_bo: ++ kfree(bo); ++} ++ ++/** ++ * panthor_kernel_bo_create() - Create and map a GEM object to a VM ++ * @ptdev: Device. ++ * @vm: VM to map the GEM to. If NULL, the kernel object is not GPU mapped. ++ * @size: Size of the buffer object. ++ * @bo_flags: Combination of drm_panthor_bo_flags flags. ++ * @vm_map_flags: Combination of drm_panthor_vm_bind_op_flags (only those ++ * that are related to map operations). ++ * @gpu_va: GPU address assigned when mapping to the VM. ++ * If gpu_va == PANTHOR_VM_KERNEL_AUTO_VA, the virtual address will be ++ * automatically allocated. ++ * ++ * Return: A valid pointer in case of success, an ERR_PTR() otherwise. ++ */ ++struct panthor_kernel_bo * ++panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm, ++ size_t size, u32 bo_flags, u32 vm_map_flags, ++ u64 gpu_va) ++{ ++ struct drm_gem_shmem_object *obj; ++ struct panthor_kernel_bo *kbo; ++ struct panthor_gem_object *bo; ++ int ret; ++ ++ if (drm_WARN_ON(&ptdev->base, !vm)) ++ return ERR_PTR(-EINVAL); ++ ++ kbo = kzalloc(sizeof(*kbo), GFP_KERNEL); ++ if (!kbo) ++ return ERR_PTR(-ENOMEM); ++ ++ obj = drm_gem_shmem_create(&ptdev->base, size); ++ if (IS_ERR(obj)) { ++ ret = PTR_ERR(obj); ++ goto err_free_bo; ++ } ++ ++ bo = to_panthor_bo(&obj->base); ++ size = obj->base.size; ++ kbo->obj = &obj->base; ++ bo->flags = bo_flags; ++ ++ ret = panthor_vm_alloc_va(vm, gpu_va, size, &kbo->va_node); ++ if (ret) ++ goto err_put_obj; ++ ++ ret = panthor_vm_map_bo_range(vm, bo, 0, size, kbo->va_node.start, vm_map_flags); ++ if (ret) ++ goto err_free_va; ++ ++ bo->exclusive_vm_root_gem = panthor_vm_root_gem(vm); ++ drm_gem_object_get(bo->exclusive_vm_root_gem); ++ bo->base.base.resv = bo->exclusive_vm_root_gem->resv; ++ return kbo; ++ ++err_free_va: ++ panthor_vm_free_va(vm, &kbo->va_node); ++ ++err_put_obj: ++ drm_gem_object_put(&obj->base); ++ ++err_free_bo: ++ kfree(kbo); ++ return ERR_PTR(ret); ++} ++ ++static int panthor_gem_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) ++{ ++ struct panthor_gem_object *bo = to_panthor_bo(obj); ++ ++ /* Don't allow mmap on objects that have the NO_MMAP flag set. */ ++ if (bo->flags & DRM_PANTHOR_BO_NO_MMAP) ++ return -EINVAL; ++ ++ return drm_gem_shmem_object_mmap(obj, vma); ++} ++ ++static struct dma_buf * ++panthor_gem_prime_export(struct drm_gem_object *obj, int flags) ++{ ++ /* We can't export GEMs that have an exclusive VM. */ ++ if (to_panthor_bo(obj)->exclusive_vm_root_gem) ++ return ERR_PTR(-EINVAL); ++ ++ return drm_gem_prime_export(obj, flags); ++} ++ ++static const struct drm_gem_object_funcs panthor_gem_funcs = { ++ .free = panthor_gem_free_object, ++ .print_info = drm_gem_shmem_object_print_info, ++ .pin = drm_gem_shmem_object_pin, ++ .unpin = drm_gem_shmem_object_unpin, ++ .get_sg_table = drm_gem_shmem_object_get_sg_table, ++ .vmap = drm_gem_shmem_object_vmap, ++ .vunmap = drm_gem_shmem_object_vunmap, ++ .mmap = panthor_gem_mmap, ++ .export = panthor_gem_prime_export, ++ .vm_ops = &drm_gem_shmem_vm_ops, ++}; ++ ++/** ++ * panthor_gem_create_object - Implementation of driver->gem_create_object. ++ * @ddev: DRM device ++ * @size: Size in bytes of the memory the object will reference ++ * ++ * This lets the GEM helpers allocate object structs for us, and keep ++ * our BO stats correct. ++ */ ++struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t size) ++{ ++ struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base); ++ struct panthor_gem_object *obj; ++ ++ obj = kzalloc(sizeof(*obj), GFP_KERNEL); ++ if (!obj) ++ return ERR_PTR(-ENOMEM); ++ ++ obj->base.base.funcs = &panthor_gem_funcs; ++ obj->base.map_wc = !ptdev->coherent; ++ mutex_init(&obj->gpuva_list_lock); ++ drm_gem_gpuva_set_lock(&obj->base.base, &obj->gpuva_list_lock); ++ ++ return &obj->base.base; ++} ++ ++/** ++ * panthor_gem_create_with_handle() - Create a GEM object and attach it to a handle. ++ * @file: DRM file. ++ * @ddev: DRM device. ++ * @exclusive_vm: Exclusive VM. Not NULL if the GEM object can't be shared. ++ * @size: Size of the GEM object to allocate. ++ * @flags: Combination of drm_panthor_bo_flags flags. ++ * @handle: Pointer holding the handle pointing to the new GEM object. ++ * ++ * Return: Zero on success ++ */ ++int ++panthor_gem_create_with_handle(struct drm_file *file, ++ struct drm_device *ddev, ++ struct panthor_vm *exclusive_vm, ++ u64 *size, u32 flags, u32 *handle) ++{ ++ int ret; ++ struct drm_gem_shmem_object *shmem; ++ struct panthor_gem_object *bo; ++ ++ shmem = drm_gem_shmem_create(ddev, *size); ++ if (IS_ERR(shmem)) ++ return PTR_ERR(shmem); ++ ++ bo = to_panthor_bo(&shmem->base); ++ bo->flags = flags; ++ ++ if (exclusive_vm) { ++ bo->exclusive_vm_root_gem = panthor_vm_root_gem(exclusive_vm); ++ drm_gem_object_get(bo->exclusive_vm_root_gem); ++ bo->base.base.resv = bo->exclusive_vm_root_gem->resv; ++ } ++ ++ /* ++ * Allocate an id of idr table where the obj is registered ++ * and handle has the id what user can see. ++ */ ++ ret = drm_gem_handle_create(file, &shmem->base, handle); ++ if (!ret) ++ *size = bo->base.base.size; ++ ++ /* drop reference from allocate - handle holds it now. */ ++ drm_gem_object_put(&shmem->base); ++ ++ return ret; ++} +diff --git a/drivers/gpu/drm/panthor/panthor_gem.h b/drivers/gpu/drm/panthor/panthor_gem.h +new file mode 100644 +index 000000000000..3bccba394d00 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_gem.h +@@ -0,0 +1,142 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifndef __PANTHOR_GEM_H__ ++#define __PANTHOR_GEM_H__ ++ ++#include ++#include ++ ++#include ++#include ++ ++struct panthor_vm; ++ ++/** ++ * struct panthor_gem_object - Driver specific GEM object. ++ */ ++struct panthor_gem_object { ++ /** @base: Inherit from drm_gem_shmem_object. */ ++ struct drm_gem_shmem_object base; ++ ++ /** ++ * @exclusive_vm_root_gem: Root GEM of the exclusive VM this GEM object ++ * is attached to. ++ * ++ * If @exclusive_vm_root_gem != NULL, any attempt to bind the GEM to a ++ * different VM will fail. ++ * ++ * All FW memory objects have this field set to the root GEM of the MCU ++ * VM. ++ */ ++ struct drm_gem_object *exclusive_vm_root_gem; ++ ++ /** ++ * @gpuva_list_lock: Custom GPUVA lock. ++ * ++ * Used to protect insertion of drm_gpuva elements to the ++ * drm_gem_object.gpuva.list list. ++ * ++ * We can't use the GEM resv for that, because drm_gpuva_link() is ++ * called in a dma-signaling path, where we're not allowed to take ++ * resv locks. ++ */ ++ struct mutex gpuva_list_lock; ++ ++ /** @flags: Combination of drm_panthor_bo_flags flags. */ ++ u32 flags; ++}; ++ ++/** ++ * struct panthor_kernel_bo - Kernel buffer object. ++ * ++ * These objects are only manipulated by the kernel driver and not ++ * directly exposed to the userspace. The GPU address of a kernel ++ * BO might be passed to userspace though. ++ */ ++struct panthor_kernel_bo { ++ /** ++ * @obj: The GEM object backing this kernel buffer object. ++ */ ++ struct drm_gem_object *obj; ++ ++ /** ++ * @va_node: VA space allocated to this GEM. ++ */ ++ struct drm_mm_node va_node; ++ ++ /** ++ * @kmap: Kernel CPU mapping of @gem. ++ */ ++ void *kmap; ++}; ++ ++static inline ++struct panthor_gem_object *to_panthor_bo(struct drm_gem_object *obj) ++{ ++ return container_of(to_drm_gem_shmem_obj(obj), struct panthor_gem_object, base); ++} ++ ++struct drm_gem_object *panthor_gem_create_object(struct drm_device *ddev, size_t size); ++ ++struct drm_gem_object * ++panthor_gem_prime_import_sg_table(struct drm_device *ddev, ++ struct dma_buf_attachment *attach, ++ struct sg_table *sgt); ++ ++int ++panthor_gem_create_with_handle(struct drm_file *file, ++ struct drm_device *ddev, ++ struct panthor_vm *exclusive_vm, ++ u64 *size, u32 flags, uint32_t *handle); ++ ++static inline u64 ++panthor_kernel_bo_gpuva(struct panthor_kernel_bo *bo) ++{ ++ return bo->va_node.start; ++} ++ ++static inline size_t ++panthor_kernel_bo_size(struct panthor_kernel_bo *bo) ++{ ++ return bo->obj->size; ++} ++ ++static inline int ++panthor_kernel_bo_vmap(struct panthor_kernel_bo *bo) ++{ ++ struct iosys_map map; ++ int ret; ++ ++ if (bo->kmap) ++ return 0; ++ ++ ret = drm_gem_vmap_unlocked(bo->obj, &map); ++ if (ret) ++ return ret; ++ ++ bo->kmap = map.vaddr; ++ return 0; ++} ++ ++static inline void ++panthor_kernel_bo_vunmap(struct panthor_kernel_bo *bo) ++{ ++ if (bo->kmap) { ++ struct iosys_map map = IOSYS_MAP_INIT_VADDR(bo->kmap); ++ ++ drm_gem_vunmap_unlocked(bo->obj, &map); ++ bo->kmap = NULL; ++ } ++} ++ ++struct panthor_kernel_bo * ++panthor_kernel_bo_create(struct panthor_device *ptdev, struct panthor_vm *vm, ++ size_t size, u32 bo_flags, u32 vm_map_flags, ++ u64 gpu_va); ++ ++void panthor_kernel_bo_destroy(struct panthor_vm *vm, ++ struct panthor_kernel_bo *bo); ++ ++#endif /* __PANTHOR_GEM_H__ */ +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:20 +0100 +Subject: drm/panthor: Add the devfreq logical block + +Every thing related to devfreq in placed in panthor_devfreq.c, and +helpers that can be called by other logical blocks are exposed through +panthor_devfreq.h. + +This implementation is loosely based on the panfrost implementation, +the only difference being that we don't count device users, because +the idle/active state will be managed by the scheduler logic. + +v6: +- Add Maxime's and Heiko's acks +- Keep header inclusion alphabetically ordered + +v4: +- Add Clement's A-b for the relicensing + +v3: +- Add acks for the MIT/GPL2 relicensing + +v2: +- Added in v2 + +Cc: Clement Peron # MIT+GPL2 relicensing +Reviewed-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Acked-by: Clement Peron # MIT+GPL2 relicensing +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_devfreq.c | 283 ++++++++++ + drivers/gpu/drm/panthor/panthor_devfreq.h | 21 + + 2 files changed, 304 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_devfreq.c b/drivers/gpu/drm/panthor/panthor_devfreq.c +new file mode 100644 +index 000000000000..7ac4fa290f27 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_devfreq.c +@@ -0,0 +1,283 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2019 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "panthor_devfreq.h" ++#include "panthor_device.h" ++ ++/** ++ * struct panthor_devfreq - Device frequency management ++ */ ++struct panthor_devfreq { ++ /** @devfreq: devfreq device. */ ++ struct devfreq *devfreq; ++ ++ /** @gov_data: Governor data. */ ++ struct devfreq_simple_ondemand_data gov_data; ++ ++ /** @busy_time: Busy time. */ ++ ktime_t busy_time; ++ ++ /** @idle_time: Idle time. */ ++ ktime_t idle_time; ++ ++ /** @time_last_update: Last update time. */ ++ ktime_t time_last_update; ++ ++ /** @last_busy_state: True if the GPU was busy last time we updated the state. */ ++ bool last_busy_state; ++ ++ /* ++ * @lock: Lock used to protect busy_time, idle_time, time_last_update and ++ * last_busy_state. ++ * ++ * These fields can be accessed concurrently by panthor_devfreq_get_dev_status() ++ * and panthor_devfreq_record_{busy,idle}(). ++ */ ++ spinlock_t lock; ++}; ++ ++static void panthor_devfreq_update_utilization(struct panthor_devfreq *pdevfreq) ++{ ++ ktime_t now, last; ++ ++ now = ktime_get(); ++ last = pdevfreq->time_last_update; ++ ++ if (pdevfreq->last_busy_state) ++ pdevfreq->busy_time += ktime_sub(now, last); ++ else ++ pdevfreq->idle_time += ktime_sub(now, last); ++ ++ pdevfreq->time_last_update = now; ++} ++ ++static int panthor_devfreq_target(struct device *dev, unsigned long *freq, ++ u32 flags) ++{ ++ struct dev_pm_opp *opp; ++ ++ opp = devfreq_recommended_opp(dev, freq, flags); ++ if (IS_ERR(opp)) ++ return PTR_ERR(opp); ++ dev_pm_opp_put(opp); ++ ++ return dev_pm_opp_set_rate(dev, *freq); ++} ++ ++static void panthor_devfreq_reset(struct panthor_devfreq *pdevfreq) ++{ ++ pdevfreq->busy_time = 0; ++ pdevfreq->idle_time = 0; ++ pdevfreq->time_last_update = ktime_get(); ++} ++ ++static int panthor_devfreq_get_dev_status(struct device *dev, ++ struct devfreq_dev_status *status) ++{ ++ struct panthor_device *ptdev = dev_get_drvdata(dev); ++ struct panthor_devfreq *pdevfreq = ptdev->devfreq; ++ unsigned long irqflags; ++ ++ status->current_frequency = clk_get_rate(ptdev->clks.core); ++ ++ spin_lock_irqsave(&pdevfreq->lock, irqflags); ++ ++ panthor_devfreq_update_utilization(pdevfreq); ++ ++ status->total_time = ktime_to_ns(ktime_add(pdevfreq->busy_time, ++ pdevfreq->idle_time)); ++ ++ status->busy_time = ktime_to_ns(pdevfreq->busy_time); ++ ++ panthor_devfreq_reset(pdevfreq); ++ ++ spin_unlock_irqrestore(&pdevfreq->lock, irqflags); ++ ++ drm_dbg(&ptdev->base, "busy %lu total %lu %lu %% freq %lu MHz\n", ++ status->busy_time, status->total_time, ++ status->busy_time / (status->total_time / 100), ++ status->current_frequency / 1000 / 1000); ++ ++ return 0; ++} ++ ++static struct devfreq_dev_profile panthor_devfreq_profile = { ++ .timer = DEVFREQ_TIMER_DELAYED, ++ .polling_ms = 50, /* ~3 frames */ ++ .target = panthor_devfreq_target, ++ .get_dev_status = panthor_devfreq_get_dev_status, ++}; ++ ++int panthor_devfreq_init(struct panthor_device *ptdev) ++{ ++ /* There's actually 2 regulators (mali and sram), but the OPP core only ++ * supports one. ++ * ++ * We assume the sram regulator is coupled with the mali one and let ++ * the coupling logic deal with voltage updates. ++ */ ++ static const char * const reg_names[] = { "mali", NULL }; ++ struct thermal_cooling_device *cooling; ++ struct device *dev = ptdev->base.dev; ++ struct panthor_devfreq *pdevfreq; ++ struct dev_pm_opp *opp; ++ unsigned long cur_freq; ++ int ret; ++ ++ pdevfreq = drmm_kzalloc(&ptdev->base, sizeof(*ptdev->devfreq), GFP_KERNEL); ++ if (!pdevfreq) ++ return -ENOMEM; ++ ++ ptdev->devfreq = pdevfreq; ++ ++ ret = devm_pm_opp_set_regulators(dev, reg_names); ++ if (ret) { ++ if (ret != -EPROBE_DEFER) ++ DRM_DEV_ERROR(dev, "Couldn't set OPP regulators\n"); ++ ++ return ret; ++ } ++ ++ ret = devm_pm_opp_of_add_table(dev); ++ if (ret) ++ return ret; ++ ++ spin_lock_init(&pdevfreq->lock); ++ ++ panthor_devfreq_reset(pdevfreq); ++ ++ cur_freq = clk_get_rate(ptdev->clks.core); ++ ++ opp = devfreq_recommended_opp(dev, &cur_freq, 0); ++ if (IS_ERR(opp)) ++ return PTR_ERR(opp); ++ ++ panthor_devfreq_profile.initial_freq = cur_freq; ++ ++ /* Regulator coupling only takes care of synchronizing/balancing voltage ++ * updates, but the coupled regulator needs to be enabled manually. ++ * ++ * We use devm_regulator_get_enable_optional() and keep the sram supply ++ * enabled until the device is removed, just like we do for the mali ++ * supply, which is enabled when dev_pm_opp_set_opp(dev, opp) is called, ++ * and disabled when the opp_table is torn down, using the devm action. ++ * ++ * If we really care about disabling regulators on suspend, we should: ++ * - use devm_regulator_get_optional() here ++ * - call dev_pm_opp_set_opp(dev, NULL) before leaving this function ++ * (this disables the regulator passed to the OPP layer) ++ * - call dev_pm_opp_set_opp(dev, NULL) and ++ * regulator_disable(ptdev->regulators.sram) in ++ * panthor_devfreq_suspend() ++ * - call dev_pm_opp_set_opp(dev, default_opp) and ++ * regulator_enable(ptdev->regulators.sram) in ++ * panthor_devfreq_resume() ++ * ++ * But without knowing if it's beneficial or not (in term of power ++ * consumption), or how much it slows down the suspend/resume steps, ++ * let's just keep regulators enabled for the device lifetime. ++ */ ++ ret = devm_regulator_get_enable_optional(dev, "sram"); ++ if (ret && ret != -ENODEV) { ++ if (ret != -EPROBE_DEFER) ++ DRM_DEV_ERROR(dev, "Couldn't retrieve/enable sram supply\n"); ++ return ret; ++ } ++ ++ /* ++ * Set the recommend OPP this will enable and configure the regulator ++ * if any and will avoid a switch off by regulator_late_cleanup() ++ */ ++ ret = dev_pm_opp_set_opp(dev, opp); ++ if (ret) { ++ DRM_DEV_ERROR(dev, "Couldn't set recommended OPP\n"); ++ return ret; ++ } ++ ++ dev_pm_opp_put(opp); ++ ++ /* ++ * Setup default thresholds for the simple_ondemand governor. ++ * The values are chosen based on experiments. ++ */ ++ pdevfreq->gov_data.upthreshold = 45; ++ pdevfreq->gov_data.downdifferential = 5; ++ ++ pdevfreq->devfreq = devm_devfreq_add_device(dev, &panthor_devfreq_profile, ++ DEVFREQ_GOV_SIMPLE_ONDEMAND, ++ &pdevfreq->gov_data); ++ if (IS_ERR(pdevfreq->devfreq)) { ++ DRM_DEV_ERROR(dev, "Couldn't initialize GPU devfreq\n"); ++ ret = PTR_ERR(pdevfreq->devfreq); ++ pdevfreq->devfreq = NULL; ++ return ret; ++ } ++ ++ cooling = devfreq_cooling_em_register(pdevfreq->devfreq, NULL); ++ if (IS_ERR(cooling)) ++ DRM_DEV_INFO(dev, "Failed to register cooling device\n"); ++ ++ return 0; ++} ++ ++int panthor_devfreq_resume(struct panthor_device *ptdev) ++{ ++ struct panthor_devfreq *pdevfreq = ptdev->devfreq; ++ ++ if (!pdevfreq->devfreq) ++ return 0; ++ ++ panthor_devfreq_reset(pdevfreq); ++ ++ return devfreq_resume_device(pdevfreq->devfreq); ++} ++ ++int panthor_devfreq_suspend(struct panthor_device *ptdev) ++{ ++ struct panthor_devfreq *pdevfreq = ptdev->devfreq; ++ ++ if (!pdevfreq->devfreq) ++ return 0; ++ ++ return devfreq_suspend_device(pdevfreq->devfreq); ++} ++ ++void panthor_devfreq_record_busy(struct panthor_device *ptdev) ++{ ++ struct panthor_devfreq *pdevfreq = ptdev->devfreq; ++ unsigned long irqflags; ++ ++ if (!pdevfreq->devfreq) ++ return; ++ ++ spin_lock_irqsave(&pdevfreq->lock, irqflags); ++ ++ panthor_devfreq_update_utilization(pdevfreq); ++ pdevfreq->last_busy_state = true; ++ ++ spin_unlock_irqrestore(&pdevfreq->lock, irqflags); ++} ++ ++void panthor_devfreq_record_idle(struct panthor_device *ptdev) ++{ ++ struct panthor_devfreq *pdevfreq = ptdev->devfreq; ++ unsigned long irqflags; ++ ++ if (!pdevfreq->devfreq) ++ return; ++ ++ spin_lock_irqsave(&pdevfreq->lock, irqflags); ++ ++ panthor_devfreq_update_utilization(pdevfreq); ++ pdevfreq->last_busy_state = false; ++ ++ spin_unlock_irqrestore(&pdevfreq->lock, irqflags); ++} +diff --git a/drivers/gpu/drm/panthor/panthor_devfreq.h b/drivers/gpu/drm/panthor/panthor_devfreq.h +new file mode 100644 +index 000000000000..83a5c9522493 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_devfreq.h +@@ -0,0 +1,21 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2019 Collabora ltd. */ ++ ++#ifndef __PANTHOR_DEVFREQ_H__ ++#define __PANTHOR_DEVFREQ_H__ ++ ++struct devfreq; ++struct thermal_cooling_device; ++ ++struct panthor_device; ++struct panthor_devfreq; ++ ++int panthor_devfreq_init(struct panthor_device *ptdev); ++ ++int panthor_devfreq_resume(struct panthor_device *ptdev); ++int panthor_devfreq_suspend(struct panthor_device *ptdev); ++ ++void panthor_devfreq_record_busy(struct panthor_device *ptdev); ++void panthor_devfreq_record_idle(struct panthor_device *ptdev); ++ ++#endif /* __PANTHOR_DEVFREQ_H__ */ +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:21 +0100 +Subject: drm/panthor: Add the MMU/VM logical block + +MMU and VM management is related and placed in the same source file. + +Page table updates are delegated to the io-pgtable-arm driver that's in +the iommu subsystem. + +The VM management logic is based on drm_gpuva_mgr, and is assuming the +VA space is mostly managed by the usermode driver, except for a reserved +portion of this VA-space that's used for kernel objects (like the heap +contexts/chunks). + +Both asynchronous and synchronous VM operations are supported, and +internal helpers are exposed to allow other logical blocks to map their +buffers in the GPU VA space. + +There's one VM_BIND queue per-VM (meaning the Vulkan driver can only +expose one sparse-binding queue), and this bind queue is managed with +a 1:1 drm_sched_entity:drm_gpu_scheduler, such that each VM gets its own +independent execution queue, avoiding VM operation serialization at the +device level (things are still serialized at the VM level). + +The rest is just implementation details that are hopefully well explained +in the documentation. + +v6: +- Add Maxime's and Heiko's acks +- Add Steve's R-b +- Adjust the TRANSCFG value to account for SW VA space limitation on + 32-bit systems +- Keep header inclusion alphabetically ordered + +v5: +- Fix a double panthor_vm_cleanup_op_ctx() call +- Fix a race between panthor_vm_prepare_map_op_ctx() and + panthor_vm_bo_put() +- Fix panthor_vm_pool_destroy_vm() kernel doc +- Fix paddr adjustment in panthor_vm_map_pages() +- Fix bo_offset calculation in panthor_vm_get_bo_for_va() + +v4: +- Add an helper to return the VM state +- Check drmm_mutex_init() return code +- Remove the VM from the AS reclaim list when panthor_vm_active() is + called +- Count the number of active VM users instead of considering there's + at most one user (several scheduling groups can point to the same + vM) +- Pre-allocate a VMA object for unmap operations (unmaps can trigger + a sm_step_remap() call) +- Check vm->root_page_table instead of vm->pgtbl_ops to detect if + the io-pgtable is trying to allocate the root page table +- Don't memset() the va_node in panthor_vm_alloc_va(), make it a + caller requirement +- Fix the kernel doc in a few places +- Drop the panthor_vm::base offset constraint and modify + panthor_vm_put() to explicitly check for a NULL value +- Fix unbalanced vm_bo refcount in panthor_gpuva_sm_step_remap() +- Drop stale comments about the shared_bos list +- Patch mmu_features::va_bits on 32-bit builds to reflect the + io_pgtable limitation and let the UMD know about it + +v3: +- Add acks for the MIT/GPL2 relicensing +- Propagate MMU faults to the scheduler +- Move pages pinning/unpinning out of the dma_signalling path +- Fix 32-bit support +- Rework the user/kernel VA range calculation +- Make the auto-VA range explicit (auto-VA range doesn't cover the full + kernel-VA range on the MCU VM) +- Let callers of panthor_vm_alloc_va() allocate the drm_mm_node + (embedded in panthor_kernel_bo now) +- Adjust things to match the latest drm_gpuvm changes (extobj tracking, + resv prep and more) +- Drop the per-AS lock and use slots_lock (fixes a race on vm->as.id) +- Set as.id to -1 when reusing an address space from the LRU list +- Drop misleading comment about page faults +- Remove check for irq being assigned in panthor_mmu_unplug() + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_mmu.c | 2768 ++++++++++ + drivers/gpu/drm/panthor/panthor_mmu.h | 102 + + 2 files changed, 2870 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c +new file mode 100644 +index 000000000000..fdd35249169f +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_mmu.c +@@ -0,0 +1,2768 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "panthor_device.h" ++#include "panthor_gem.h" ++#include "panthor_heap.h" ++#include "panthor_mmu.h" ++#include "panthor_regs.h" ++#include "panthor_sched.h" ++ ++#define MAX_AS_SLOTS 32 ++ ++struct panthor_vm; ++ ++/** ++ * struct panthor_as_slot - Address space slot ++ */ ++struct panthor_as_slot { ++ /** @vm: VM bound to this slot. NULL is no VM is bound. */ ++ struct panthor_vm *vm; ++}; ++ ++/** ++ * struct panthor_mmu - MMU related data ++ */ ++struct panthor_mmu { ++ /** @irq: The MMU irq. */ ++ struct panthor_irq irq; ++ ++ /** @as: Address space related fields. ++ * ++ * The GPU has a limited number of address spaces (AS) slots, forcing ++ * us to re-assign them to re-assign slots on-demand. ++ */ ++ struct { ++ /** @slots_lock: Lock protecting access to all other AS fields. */ ++ struct mutex slots_lock; ++ ++ /** @alloc_mask: Bitmask encoding the allocated slots. */ ++ unsigned long alloc_mask; ++ ++ /** @faulty_mask: Bitmask encoding the faulty slots. */ ++ unsigned long faulty_mask; ++ ++ /** @slots: VMs currently bound to the AS slots. */ ++ struct panthor_as_slot slots[MAX_AS_SLOTS]; ++ ++ /** ++ * @lru_list: List of least recently used VMs. ++ * ++ * We use this list to pick a VM to evict when all slots are ++ * used. ++ * ++ * There should be no more active VMs than there are AS slots, ++ * so this LRU is just here to keep VMs bound until there's ++ * a need to release a slot, thus avoid unnecessary TLB/cache ++ * flushes. ++ */ ++ struct list_head lru_list; ++ } as; ++ ++ /** @vm: VMs management fields */ ++ struct { ++ /** @lock: Lock protecting access to list. */ ++ struct mutex lock; ++ ++ /** @list: List containing all VMs. */ ++ struct list_head list; ++ ++ /** @reset_in_progress: True if a reset is in progress. */ ++ bool reset_in_progress; ++ ++ /** @wq: Workqueue used for the VM_BIND queues. */ ++ struct workqueue_struct *wq; ++ } vm; ++}; ++ ++/** ++ * struct panthor_vm_pool - VM pool object ++ */ ++struct panthor_vm_pool { ++ /** @xa: Array used for VM handle tracking. */ ++ struct xarray xa; ++}; ++ ++/** ++ * struct panthor_vma - GPU mapping object ++ * ++ * This is used to track GEM mappings in GPU space. ++ */ ++struct panthor_vma { ++ /** @base: Inherits from drm_gpuva. */ ++ struct drm_gpuva base; ++ ++ /** @node: Used to implement deferred release of VMAs. */ ++ struct list_head node; ++ ++ /** ++ * @flags: Combination of drm_panthor_vm_bind_op_flags. ++ * ++ * Only map related flags are accepted. ++ */ ++ u32 flags; ++}; ++ ++/** ++ * struct panthor_vm_op_ctx - VM operation context ++ * ++ * With VM operations potentially taking place in a dma-signaling path, we ++ * need to make sure everything that might require resource allocation is ++ * pre-allocated upfront. This is what this operation context is far. ++ * ++ * We also collect resources that have been freed, so we can release them ++ * asynchronously, and let the VM_BIND scheduler process the next VM_BIND ++ * request. ++ */ ++struct panthor_vm_op_ctx { ++ /** @rsvd_page_tables: Pages reserved for the MMU page table update. */ ++ struct { ++ /** @count: Number of pages reserved. */ ++ u32 count; ++ ++ /** @ptr: Point to the first unused page in the @pages table. */ ++ u32 ptr; ++ ++ /** ++ * @page: Array of pages that can be used for an MMU page table update. ++ * ++ * After an VM operation, there might be free pages left in this array. ++ * They should be returned to the pt_cache as part of the op_ctx cleanup. ++ */ ++ void **pages; ++ } rsvd_page_tables; ++ ++ /** ++ * @preallocated_vmas: Pre-allocated VMAs to handle the remap case. ++ * ++ * Partial unmap requests or map requests overlapping existing mappings will ++ * trigger a remap call, which need to register up to three panthor_vma objects ++ * (one for the new mapping, and two for the previous and next mappings). ++ */ ++ struct panthor_vma *preallocated_vmas[3]; ++ ++ /** @flags: Combination of drm_panthor_vm_bind_op_flags. */ ++ u32 flags; ++ ++ /** @va: Virtual range targeted by the VM operation. */ ++ struct { ++ /** @addr: Start address. */ ++ u64 addr; ++ ++ /** @range: Range size. */ ++ u64 range; ++ } va; ++ ++ /** ++ * @returned_vmas: List of panthor_vma objects returned after a VM operation. ++ * ++ * For unmap operations, this will contain all VMAs that were covered by the ++ * specified VA range. ++ * ++ * For map operations, this will contain all VMAs that previously mapped to ++ * the specified VA range. ++ * ++ * Those VMAs, and the resources they point to will be released as part of ++ * the op_ctx cleanup operation. ++ */ ++ struct list_head returned_vmas; ++ ++ /** @map: Fields specific to a map operation. */ ++ struct { ++ /** @vm_bo: Buffer object to map. */ ++ struct drm_gpuvm_bo *vm_bo; ++ ++ /** @bo_offset: Offset in the buffer object. */ ++ u64 bo_offset; ++ ++ /** ++ * @sgt: sg-table pointing to pages backing the GEM object. ++ * ++ * This is gathered at job creation time, such that we don't have ++ * to allocate in ::run_job(). ++ */ ++ struct sg_table *sgt; ++ ++ /** ++ * @new_vma: The new VMA object that will be inserted to the VA tree. ++ */ ++ struct panthor_vma *new_vma; ++ } map; ++}; ++ ++/** ++ * struct panthor_vm - VM object ++ * ++ * A VM is an object representing a GPU (or MCU) virtual address space. ++ * It embeds the MMU page table for this address space, a tree containing ++ * all the virtual mappings of GEM objects, and other things needed to manage ++ * the VM. ++ * ++ * Except for the MCU VM, which is managed by the kernel, all other VMs are ++ * created by userspace and mostly managed by userspace, using the ++ * %DRM_IOCTL_PANTHOR_VM_BIND ioctl. ++ * ++ * A portion of the virtual address space is reserved for kernel objects, ++ * like heap chunks, and userspace gets to decide how much of the virtual ++ * address space is left to the kernel (half of the virtual address space ++ * by default). ++ */ ++struct panthor_vm { ++ /** ++ * @base: Inherit from drm_gpuvm. ++ * ++ * We delegate all the VA management to the common drm_gpuvm framework ++ * and only implement hooks to update the MMU page table. ++ */ ++ struct drm_gpuvm base; ++ ++ /** ++ * @sched: Scheduler used for asynchronous VM_BIND request. ++ * ++ * We use a 1:1 scheduler here. ++ */ ++ struct drm_gpu_scheduler sched; ++ ++ /** ++ * @entity: Scheduling entity representing the VM_BIND queue. ++ * ++ * There's currently one bind queue per VM. It doesn't make sense to ++ * allow more given the VM operations are serialized anyway. ++ */ ++ struct drm_sched_entity entity; ++ ++ /** @ptdev: Device. */ ++ struct panthor_device *ptdev; ++ ++ /** @memattr: Value to program to the AS_MEMATTR register. */ ++ u64 memattr; ++ ++ /** @pgtbl_ops: Page table operations. */ ++ struct io_pgtable_ops *pgtbl_ops; ++ ++ /** @root_page_table: Stores the root page table pointer. */ ++ void *root_page_table; ++ ++ /** ++ * @op_lock: Lock used to serialize operations on a VM. ++ * ++ * The serialization of jobs queued to the VM_BIND queue is already ++ * taken care of by drm_sched, but we need to serialize synchronous ++ * and asynchronous VM_BIND request. This is what this lock is for. ++ */ ++ struct mutex op_lock; ++ ++ /** ++ * @op_ctx: The context attached to the currently executing VM operation. ++ * ++ * NULL when no operation is in progress. ++ */ ++ struct panthor_vm_op_ctx *op_ctx; ++ ++ /** ++ * @mm: Memory management object representing the auto-VA/kernel-VA. ++ * ++ * Used to auto-allocate VA space for kernel-managed objects (tiler ++ * heaps, ...). ++ * ++ * For the MCU VM, this is managing the VA range that's used to map ++ * all shared interfaces. ++ * ++ * For user VMs, the range is specified by userspace, and must not ++ * exceed half of the VA space addressable. ++ */ ++ struct drm_mm mm; ++ ++ /** @mm_lock: Lock protecting the @mm field. */ ++ struct mutex mm_lock; ++ ++ /** @kernel_auto_va: Automatic VA-range for kernel BOs. */ ++ struct { ++ /** @start: Start of the automatic VA-range for kernel BOs. */ ++ u64 start; ++ ++ /** @size: Size of the automatic VA-range for kernel BOs. */ ++ u64 end; ++ } kernel_auto_va; ++ ++ /** @as: Address space related fields. */ ++ struct { ++ /** ++ * @id: ID of the address space this VM is bound to. ++ * ++ * A value of -1 means the VM is inactive/not bound. ++ */ ++ int id; ++ ++ /** @active_cnt: Number of active users of this VM. */ ++ refcount_t active_cnt; ++ ++ /** ++ * @lru_node: Used to instead the VM in the panthor_mmu::as::lru_list. ++ * ++ * Active VMs should not be inserted in the LRU list. ++ */ ++ struct list_head lru_node; ++ } as; ++ ++ /** ++ * @heaps: Tiler heap related fields. ++ */ ++ struct { ++ /** ++ * @pool: The heap pool attached to this VM. ++ * ++ * Will stay NULL until someone creates a heap context on this VM. ++ */ ++ struct panthor_heap_pool *pool; ++ ++ /** @lock: Lock used to protect access to @pool. */ ++ struct mutex lock; ++ } heaps; ++ ++ /** @node: Used to insert the VM in the panthor_mmu::vm::list. */ ++ struct list_head node; ++ ++ /** @for_mcu: True if this is the MCU VM. */ ++ bool for_mcu; ++ ++ /** ++ * @destroyed: True if the VM was destroyed. ++ * ++ * No further bind requests should be queued to a destroyed VM. ++ */ ++ bool destroyed; ++ ++ /** ++ * @unusable: True if the VM has turned unusable because something ++ * bad happened during an asynchronous request. ++ * ++ * We don't try to recover from such failures, because this implies ++ * informing userspace about the specific operation that failed, and ++ * hoping the userspace driver can replay things from there. This all ++ * sounds very complicated for little gain. ++ * ++ * Instead, we should just flag the VM as unusable, and fail any ++ * further request targeting this VM. ++ * ++ * We also provide a way to query a VM state, so userspace can destroy ++ * it and create a new one. ++ * ++ * As an analogy, this would be mapped to a VK_ERROR_DEVICE_LOST ++ * situation, where the logical device needs to be re-created. ++ */ ++ bool unusable; ++ ++ /** ++ * @unhandled_fault: Unhandled fault happened. ++ * ++ * This should be reported to the scheduler, and the queue/group be ++ * flagged as faulty as a result. ++ */ ++ bool unhandled_fault; ++}; ++ ++/** ++ * struct panthor_vm_bind_job - VM bind job ++ */ ++struct panthor_vm_bind_job { ++ /** @base: Inherit from drm_sched_job. */ ++ struct drm_sched_job base; ++ ++ /** @refcount: Reference count. */ ++ struct kref refcount; ++ ++ /** @cleanup_op_ctx_work: Work used to cleanup the VM operation context. */ ++ struct work_struct cleanup_op_ctx_work; ++ ++ /** @vm: VM targeted by the VM operation. */ ++ struct panthor_vm *vm; ++ ++ /** @ctx: Operation context. */ ++ struct panthor_vm_op_ctx ctx; ++}; ++ ++/** ++ * @pt_cache: Cache used to allocate MMU page tables. ++ * ++ * The pre-allocation pattern forces us to over-allocate to plan for ++ * the worst case scenario, and return the pages we didn't use. ++ * ++ * Having a kmem_cache allows us to speed allocations. ++ */ ++static struct kmem_cache *pt_cache; ++ ++/** ++ * alloc_pt() - Custom page table allocator ++ * @cookie: Cookie passed at page table allocation time. ++ * @size: Size of the page table. This size should be fixed, ++ * and determined at creation time based on the granule size. ++ * @gfp: GFP flags. ++ * ++ * We want a custom allocator so we can use a cache for page table ++ * allocations and amortize the cost of the over-reservation that's ++ * done to allow asynchronous VM operations. ++ * ++ * Return: non-NULL on success, NULL if the allocation failed for any ++ * reason. ++ */ ++static void *alloc_pt(void *cookie, size_t size, gfp_t gfp) ++{ ++ struct panthor_vm *vm = cookie; ++ void *page; ++ ++ /* Allocation of the root page table happening during init. */ ++ if (unlikely(!vm->root_page_table)) { ++ struct page *p; ++ ++ drm_WARN_ON(&vm->ptdev->base, vm->op_ctx); ++ p = alloc_pages_node(dev_to_node(vm->ptdev->base.dev), ++ gfp | __GFP_ZERO, get_order(size)); ++ page = p ? page_address(p) : NULL; ++ vm->root_page_table = page; ++ return page; ++ } ++ ++ /* We're not supposed to have anything bigger than 4k here, because we picked a ++ * 4k granule size at init time. ++ */ ++ if (drm_WARN_ON(&vm->ptdev->base, size != SZ_4K)) ++ return NULL; ++ ++ /* We must have some op_ctx attached to the VM and it must have at least one ++ * free page. ++ */ ++ if (drm_WARN_ON(&vm->ptdev->base, !vm->op_ctx) || ++ drm_WARN_ON(&vm->ptdev->base, ++ vm->op_ctx->rsvd_page_tables.ptr >= vm->op_ctx->rsvd_page_tables.count)) ++ return NULL; ++ ++ page = vm->op_ctx->rsvd_page_tables.pages[vm->op_ctx->rsvd_page_tables.ptr++]; ++ memset(page, 0, SZ_4K); ++ ++ /* Page table entries don't use virtual addresses, which trips out ++ * kmemleak. kmemleak_alloc_phys() might work, but physical addresses ++ * are mixed with other fields, and I fear kmemleak won't detect that ++ * either. ++ * ++ * Let's just ignore memory passed to the page-table driver for now. ++ */ ++ kmemleak_ignore(page); ++ return page; ++} ++ ++/** ++ * @free_pt() - Custom page table free function ++ * @cookie: Cookie passed at page table allocation time. ++ * @data: Page table to free. ++ * @size: Size of the page table. This size should be fixed, ++ * and determined at creation time based on the granule size. ++ */ ++static void free_pt(void *cookie, void *data, size_t size) ++{ ++ struct panthor_vm *vm = cookie; ++ ++ if (unlikely(vm->root_page_table == data)) { ++ free_pages((unsigned long)data, get_order(size)); ++ vm->root_page_table = NULL; ++ return; ++ } ++ ++ if (drm_WARN_ON(&vm->ptdev->base, size != SZ_4K)) ++ return; ++ ++ /* Return the page to the pt_cache. */ ++ kmem_cache_free(pt_cache, data); ++} ++ ++static int wait_ready(struct panthor_device *ptdev, u32 as_nr) ++{ ++ int ret; ++ u32 val; ++ ++ /* Wait for the MMU status to indicate there is no active command, in ++ * case one is pending. ++ */ ++ ret = readl_relaxed_poll_timeout_atomic(ptdev->iomem + AS_STATUS(as_nr), ++ val, !(val & AS_STATUS_AS_ACTIVE), ++ 10, 100000); ++ ++ if (ret) { ++ panthor_device_schedule_reset(ptdev); ++ drm_err(&ptdev->base, "AS_ACTIVE bit stuck\n"); ++ } ++ ++ return ret; ++} ++ ++static int write_cmd(struct panthor_device *ptdev, u32 as_nr, u32 cmd) ++{ ++ int status; ++ ++ /* write AS_COMMAND when MMU is ready to accept another command */ ++ status = wait_ready(ptdev, as_nr); ++ if (!status) ++ gpu_write(ptdev, AS_COMMAND(as_nr), cmd); ++ ++ return status; ++} ++ ++static void lock_region(struct panthor_device *ptdev, u32 as_nr, ++ u64 region_start, u64 size) ++{ ++ u8 region_width; ++ u64 region; ++ u64 region_end = region_start + size; ++ ++ if (!size) ++ return; ++ ++ /* ++ * The locked region is a naturally aligned power of 2 block encoded as ++ * log2 minus(1). ++ * Calculate the desired start/end and look for the highest bit which ++ * differs. The smallest naturally aligned block must include this bit ++ * change, the desired region starts with this bit (and subsequent bits) ++ * zeroed and ends with the bit (and subsequent bits) set to one. ++ */ ++ region_width = max(fls64(region_start ^ (region_end - 1)), ++ const_ilog2(AS_LOCK_REGION_MIN_SIZE)) - 1; ++ ++ /* ++ * Mask off the low bits of region_start (which would be ignored by ++ * the hardware anyway) ++ */ ++ region_start &= GENMASK_ULL(63, region_width); ++ ++ region = region_width | region_start; ++ ++ /* Lock the region that needs to be updated */ ++ gpu_write(ptdev, AS_LOCKADDR_LO(as_nr), lower_32_bits(region)); ++ gpu_write(ptdev, AS_LOCKADDR_HI(as_nr), upper_32_bits(region)); ++ write_cmd(ptdev, as_nr, AS_COMMAND_LOCK); ++} ++ ++static int mmu_hw_do_operation_locked(struct panthor_device *ptdev, int as_nr, ++ u64 iova, u64 size, u32 op) ++{ ++ lockdep_assert_held(&ptdev->mmu->as.slots_lock); ++ ++ if (as_nr < 0) ++ return 0; ++ ++ if (op != AS_COMMAND_UNLOCK) ++ lock_region(ptdev, as_nr, iova, size); ++ ++ /* Run the MMU operation */ ++ write_cmd(ptdev, as_nr, op); ++ ++ /* Wait for the flush to complete */ ++ return wait_ready(ptdev, as_nr); ++} ++ ++static int mmu_hw_do_operation(struct panthor_vm *vm, ++ u64 iova, u64 size, u32 op) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ int ret; ++ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ ret = mmu_hw_do_operation_locked(ptdev, vm->as.id, iova, size, op); ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++ return ret; ++} ++ ++static int panthor_mmu_as_enable(struct panthor_device *ptdev, u32 as_nr, ++ u64 transtab, u64 transcfg, u64 memattr) ++{ ++ int ret; ++ ++ ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM); ++ if (ret) ++ return ret; ++ ++ gpu_write(ptdev, AS_TRANSTAB_LO(as_nr), lower_32_bits(transtab)); ++ gpu_write(ptdev, AS_TRANSTAB_HI(as_nr), upper_32_bits(transtab)); ++ ++ gpu_write(ptdev, AS_MEMATTR_LO(as_nr), lower_32_bits(memattr)); ++ gpu_write(ptdev, AS_MEMATTR_HI(as_nr), upper_32_bits(memattr)); ++ ++ gpu_write(ptdev, AS_TRANSCFG_LO(as_nr), lower_32_bits(transcfg)); ++ gpu_write(ptdev, AS_TRANSCFG_HI(as_nr), upper_32_bits(transcfg)); ++ ++ return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE); ++} ++ ++static int panthor_mmu_as_disable(struct panthor_device *ptdev, u32 as_nr) ++{ ++ int ret; ++ ++ ret = mmu_hw_do_operation_locked(ptdev, as_nr, 0, ~0ULL, AS_COMMAND_FLUSH_MEM); ++ if (ret) ++ return ret; ++ ++ gpu_write(ptdev, AS_TRANSTAB_LO(as_nr), 0); ++ gpu_write(ptdev, AS_TRANSTAB_HI(as_nr), 0); ++ ++ gpu_write(ptdev, AS_MEMATTR_LO(as_nr), 0); ++ gpu_write(ptdev, AS_MEMATTR_HI(as_nr), 0); ++ ++ gpu_write(ptdev, AS_TRANSCFG_LO(as_nr), AS_TRANSCFG_ADRMODE_UNMAPPED); ++ gpu_write(ptdev, AS_TRANSCFG_HI(as_nr), 0); ++ ++ return write_cmd(ptdev, as_nr, AS_COMMAND_UPDATE); ++} ++ ++static u32 panthor_mmu_fault_mask(struct panthor_device *ptdev, u32 value) ++{ ++ /* Bits 16 to 31 mean REQ_COMPLETE. */ ++ return value & GENMASK(15, 0); ++} ++ ++static u32 panthor_mmu_as_fault_mask(struct panthor_device *ptdev, u32 as) ++{ ++ return BIT(as); ++} ++ ++/** ++ * panthor_vm_has_unhandled_faults() - Check if a VM has unhandled faults ++ * @vm: VM to check. ++ * ++ * Return: true if the VM has unhandled faults, false otherwise. ++ */ ++bool panthor_vm_has_unhandled_faults(struct panthor_vm *vm) ++{ ++ return vm->unhandled_fault; ++} ++ ++/** ++ * panthor_vm_is_unusable() - Check if the VM is still usable ++ * @vm: VM to check. ++ * ++ * Return: true if the VM is unusable, false otherwise. ++ */ ++bool panthor_vm_is_unusable(struct panthor_vm *vm) ++{ ++ return vm->unusable; ++} ++ ++static void panthor_vm_release_as_locked(struct panthor_vm *vm) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ ++ lockdep_assert_held(&ptdev->mmu->as.slots_lock); ++ ++ if (drm_WARN_ON(&ptdev->base, vm->as.id < 0)) ++ return; ++ ++ ptdev->mmu->as.slots[vm->as.id].vm = NULL; ++ clear_bit(vm->as.id, &ptdev->mmu->as.alloc_mask); ++ refcount_set(&vm->as.active_cnt, 0); ++ list_del_init(&vm->as.lru_node); ++ vm->as.id = -1; ++} ++ ++/** ++ * panthor_vm_active() - Flag a VM as active ++ * @VM: VM to flag as active. ++ * ++ * Assigns an address space to a VM so it can be used by the GPU/MCU. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_active(struct panthor_vm *vm) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features); ++ struct io_pgtable_cfg *cfg = &io_pgtable_ops_to_pgtable(vm->pgtbl_ops)->cfg; ++ int ret = 0, as, cookie; ++ u64 transtab, transcfg; ++ ++ if (!drm_dev_enter(&ptdev->base, &cookie)) ++ return -ENODEV; ++ ++ if (refcount_inc_not_zero(&vm->as.active_cnt)) ++ goto out_dev_exit; ++ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ ++ if (refcount_inc_not_zero(&vm->as.active_cnt)) ++ goto out_unlock; ++ ++ as = vm->as.id; ++ if (as >= 0) { ++ /* Unhandled pagefault on this AS, the MMU was disabled. We need to ++ * re-enable the MMU after clearing+unmasking the AS interrupts. ++ */ ++ if (ptdev->mmu->as.faulty_mask & panthor_mmu_as_fault_mask(ptdev, as)) ++ goto out_enable_as; ++ ++ goto out_make_active; ++ } ++ ++ /* Check for a free AS */ ++ if (vm->for_mcu) { ++ drm_WARN_ON(&ptdev->base, ptdev->mmu->as.alloc_mask & BIT(0)); ++ as = 0; ++ } else { ++ as = ffz(ptdev->mmu->as.alloc_mask | BIT(0)); ++ } ++ ++ if (!(BIT(as) & ptdev->gpu_info.as_present)) { ++ struct panthor_vm *lru_vm; ++ ++ lru_vm = list_first_entry_or_null(&ptdev->mmu->as.lru_list, ++ struct panthor_vm, ++ as.lru_node); ++ if (drm_WARN_ON(&ptdev->base, !lru_vm)) { ++ ret = -EBUSY; ++ goto out_unlock; ++ } ++ ++ drm_WARN_ON(&ptdev->base, refcount_read(&lru_vm->as.active_cnt)); ++ as = lru_vm->as.id; ++ panthor_vm_release_as_locked(lru_vm); ++ } ++ ++ /* Assign the free or reclaimed AS to the FD */ ++ vm->as.id = as; ++ set_bit(as, &ptdev->mmu->as.alloc_mask); ++ ptdev->mmu->as.slots[as].vm = vm; ++ ++out_enable_as: ++ transtab = cfg->arm_lpae_s1_cfg.ttbr; ++ transcfg = AS_TRANSCFG_PTW_MEMATTR_WB | ++ AS_TRANSCFG_PTW_RA | ++ AS_TRANSCFG_ADRMODE_AARCH64_4K | ++ AS_TRANSCFG_INA_BITS(55 - va_bits); ++ if (ptdev->coherent) ++ transcfg |= AS_TRANSCFG_PTW_SH_OS; ++ ++ /* If the VM is re-activated, we clear the fault. */ ++ vm->unhandled_fault = false; ++ ++ /* Unhandled pagefault on this AS, clear the fault and re-enable interrupts ++ * before enabling the AS. ++ */ ++ if (ptdev->mmu->as.faulty_mask & panthor_mmu_as_fault_mask(ptdev, as)) { ++ gpu_write(ptdev, MMU_INT_CLEAR, panthor_mmu_as_fault_mask(ptdev, as)); ++ ptdev->mmu->as.faulty_mask &= ~panthor_mmu_as_fault_mask(ptdev, as); ++ gpu_write(ptdev, MMU_INT_MASK, ~ptdev->mmu->as.faulty_mask); ++ } ++ ++ ret = panthor_mmu_as_enable(vm->ptdev, vm->as.id, transtab, transcfg, vm->memattr); ++ ++out_make_active: ++ if (!ret) { ++ refcount_set(&vm->as.active_cnt, 1); ++ list_del_init(&vm->as.lru_node); ++ } ++ ++out_unlock: ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++out_dev_exit: ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++/** ++ * panthor_vm_idle() - Flag a VM idle ++ * @VM: VM to flag as idle. ++ * ++ * When we know the GPU is done with the VM (no more jobs to process), ++ * we can relinquish the AS slot attached to this VM, if any. ++ * ++ * We don't release the slot immediately, but instead place the VM in ++ * the LRU list, so it can be evicted if another VM needs an AS slot. ++ * This way, VMs keep attached to the AS they were given until we run ++ * out of free slot, limiting the number of MMU operations (TLB flush ++ * and other AS updates). ++ */ ++void panthor_vm_idle(struct panthor_vm *vm) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ ++ if (!refcount_dec_and_mutex_lock(&vm->as.active_cnt, &ptdev->mmu->as.slots_lock)) ++ return; ++ ++ if (!drm_WARN_ON(&ptdev->base, vm->as.id == -1 || !list_empty(&vm->as.lru_node))) ++ list_add_tail(&vm->as.lru_node, &ptdev->mmu->as.lru_list); ++ ++ refcount_set(&vm->as.active_cnt, 0); ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++} ++ ++static void panthor_vm_stop(struct panthor_vm *vm) ++{ ++ drm_sched_stop(&vm->sched, NULL); ++} ++ ++static void panthor_vm_start(struct panthor_vm *vm) ++{ ++ drm_sched_start(&vm->sched, true); ++} ++ ++/** ++ * panthor_vm_as() - Get the AS slot attached to a VM ++ * @vm: VM to get the AS slot of. ++ * ++ * Return: -1 if the VM is not assigned an AS slot yet, >= 0 otherwise. ++ */ ++int panthor_vm_as(struct panthor_vm *vm) ++{ ++ return vm->as.id; ++} ++ ++static size_t get_pgsize(u64 addr, size_t size, size_t *count) ++{ ++ /* ++ * io-pgtable only operates on multiple pages within a single table ++ * entry, so we need to split at boundaries of the table size, i.e. ++ * the next block size up. The distance from address A to the next ++ * boundary of block size B is logically B - A % B, but in unsigned ++ * two's complement where B is a power of two we get the equivalence ++ * B - A % B == (B - A) % B == (n * B - A) % B, and choose n = 0 :) ++ */ ++ size_t blk_offset = -addr % SZ_2M; ++ ++ if (blk_offset || size < SZ_2M) { ++ *count = min_not_zero(blk_offset, size) / SZ_4K; ++ return SZ_4K; ++ } ++ blk_offset = -addr % SZ_1G ?: SZ_1G; ++ *count = min(blk_offset, size) / SZ_2M; ++ return SZ_2M; ++} ++ ++static int panthor_vm_flush_range(struct panthor_vm *vm, u64 iova, u64 size) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ int ret = 0, cookie; ++ ++ if (vm->as.id < 0) ++ return 0; ++ ++ /* If the device is unplugged, we just silently skip the flush. */ ++ if (!drm_dev_enter(&ptdev->base, &cookie)) ++ return 0; ++ ++ /* Flush the PTs only if we're already awake */ ++ if (pm_runtime_active(ptdev->base.dev)) ++ ret = mmu_hw_do_operation(vm, iova, size, AS_COMMAND_FLUSH_PT); ++ ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static int panthor_vm_unmap_pages(struct panthor_vm *vm, u64 iova, u64 size) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ struct io_pgtable_ops *ops = vm->pgtbl_ops; ++ u64 offset = 0; ++ ++ drm_dbg(&ptdev->base, "unmap: as=%d, iova=%llx, len=%llx", vm->as.id, iova, size); ++ ++ while (offset < size) { ++ size_t unmapped_sz = 0, pgcount; ++ size_t pgsize = get_pgsize(iova + offset, size - offset, &pgcount); ++ ++ unmapped_sz = ops->unmap_pages(ops, iova + offset, pgsize, pgcount, NULL); ++ ++ if (drm_WARN_ON(&ptdev->base, unmapped_sz != pgsize * pgcount)) { ++ drm_err(&ptdev->base, "failed to unmap range %llx-%llx (requested range %llx-%llx)\n", ++ iova + offset + unmapped_sz, ++ iova + offset + pgsize * pgcount, ++ iova, iova + size); ++ panthor_vm_flush_range(vm, iova, offset + unmapped_sz); ++ return -EINVAL; ++ } ++ offset += unmapped_sz; ++ } ++ ++ return panthor_vm_flush_range(vm, iova, size); ++} ++ ++static int ++panthor_vm_map_pages(struct panthor_vm *vm, u64 iova, int prot, ++ struct sg_table *sgt, u64 offset, u64 size) ++{ ++ struct panthor_device *ptdev = vm->ptdev; ++ unsigned int count; ++ struct scatterlist *sgl; ++ struct io_pgtable_ops *ops = vm->pgtbl_ops; ++ u64 start_iova = iova; ++ int ret; ++ ++ if (!size) ++ return 0; ++ ++ for_each_sgtable_dma_sg(sgt, sgl, count) { ++ dma_addr_t paddr = sg_dma_address(sgl); ++ size_t len = sg_dma_len(sgl); ++ ++ if (len <= offset) { ++ offset -= len; ++ continue; ++ } ++ ++ paddr += offset; ++ len -= offset; ++ len = min_t(size_t, len, size); ++ size -= len; ++ ++ drm_dbg(&ptdev->base, "map: as=%d, iova=%llx, paddr=%pad, len=%zx", ++ vm->as.id, iova, &paddr, len); ++ ++ while (len) { ++ size_t pgcount, mapped = 0; ++ size_t pgsize = get_pgsize(iova | paddr, len, &pgcount); ++ ++ ret = ops->map_pages(ops, iova, paddr, pgsize, pgcount, prot, ++ GFP_KERNEL, &mapped); ++ iova += mapped; ++ paddr += mapped; ++ len -= mapped; ++ ++ if (drm_WARN_ON(&ptdev->base, !ret && !mapped)) ++ ret = -ENOMEM; ++ ++ if (ret) { ++ /* If something failed, unmap what we've already mapped before ++ * returning. The unmap call is not supposed to fail. ++ */ ++ drm_WARN_ON(&ptdev->base, ++ panthor_vm_unmap_pages(vm, start_iova, ++ iova - start_iova)); ++ return ret; ++ } ++ } ++ ++ if (!size) ++ break; ++ } ++ ++ return panthor_vm_flush_range(vm, start_iova, iova - start_iova); ++} ++ ++static int flags_to_prot(u32 flags) ++{ ++ int prot = 0; ++ ++ if (flags & DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC) ++ prot |= IOMMU_NOEXEC; ++ ++ if (!(flags & DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED)) ++ prot |= IOMMU_CACHE; ++ ++ if (flags & DRM_PANTHOR_VM_BIND_OP_MAP_READONLY) ++ prot |= IOMMU_READ; ++ else ++ prot |= IOMMU_READ | IOMMU_WRITE; ++ ++ return prot; ++} ++ ++/** ++ * panthor_vm_alloc_va() - Allocate a region in the auto-va space ++ * @VM: VM to allocate a region on. ++ * @va: start of the VA range. Can be PANTHOR_VM_KERNEL_AUTO_VA if the user ++ * wants the VA to be automatically allocated from the auto-VA range. ++ * @size: size of the VA range. ++ * @va_node: drm_mm_node to initialize. Must be zero-initialized. ++ * ++ * Some GPU objects, like heap chunks, are fully managed by the kernel and ++ * need to be mapped to the userspace VM, in the region reserved for kernel ++ * objects. ++ * ++ * This function takes care of allocating a region in the kernel auto-VA space. ++ * ++ * Return: 0 on success, an error code otherwise. ++ */ ++int ++panthor_vm_alloc_va(struct panthor_vm *vm, u64 va, u64 size, ++ struct drm_mm_node *va_node) ++{ ++ int ret; ++ ++ if (!size || (size & ~PAGE_MASK)) ++ return -EINVAL; ++ ++ if (va != PANTHOR_VM_KERNEL_AUTO_VA && (va & ~PAGE_MASK)) ++ return -EINVAL; ++ ++ mutex_lock(&vm->mm_lock); ++ if (va != PANTHOR_VM_KERNEL_AUTO_VA) { ++ va_node->start = va; ++ va_node->size = size; ++ ret = drm_mm_reserve_node(&vm->mm, va_node); ++ } else { ++ ret = drm_mm_insert_node_in_range(&vm->mm, va_node, size, ++ size >= SZ_2M ? SZ_2M : SZ_4K, ++ 0, vm->kernel_auto_va.start, ++ vm->kernel_auto_va.end, ++ DRM_MM_INSERT_BEST); ++ } ++ mutex_unlock(&vm->mm_lock); ++ ++ return ret; ++} ++ ++/** ++ * panthor_vm_free_va() - Free a region allocated with panthor_vm_alloc_va() ++ * @VM: VM to free the region on. ++ * @va_node: Memory node representing the region to free. ++ */ ++void panthor_vm_free_va(struct panthor_vm *vm, struct drm_mm_node *va_node) ++{ ++ mutex_lock(&vm->mm_lock); ++ drm_mm_remove_node(va_node); ++ mutex_unlock(&vm->mm_lock); ++} ++ ++static void panthor_vm_bo_put(struct drm_gpuvm_bo *vm_bo) ++{ ++ struct panthor_gem_object *bo = to_panthor_bo(vm_bo->obj); ++ struct drm_gpuvm *vm = vm_bo->vm; ++ bool unpin; ++ ++ /* We must retain the GEM before calling drm_gpuvm_bo_put(), ++ * otherwise the mutex might be destroyed while we hold it. ++ * Same goes for the VM, since we take the VM resv lock. ++ */ ++ drm_gem_object_get(&bo->base.base); ++ drm_gpuvm_get(vm); ++ ++ /* We take the resv lock to protect against concurrent accesses to the ++ * gpuvm evicted/extobj lists that are modified in ++ * drm_gpuvm_bo_destroy(), which is called if drm_gpuvm_bo_put() ++ * releases sthe last vm_bo reference. ++ * We take the BO GPUVA list lock to protect the vm_bo removal from the ++ * GEM vm_bo list. ++ */ ++ dma_resv_lock(drm_gpuvm_resv(vm), NULL); ++ mutex_lock(&bo->gpuva_list_lock); ++ unpin = drm_gpuvm_bo_put(vm_bo); ++ mutex_unlock(&bo->gpuva_list_lock); ++ dma_resv_unlock(drm_gpuvm_resv(vm)); ++ ++ /* If the vm_bo object was destroyed, release the pin reference that ++ * was hold by this object. ++ */ ++ if (unpin && !bo->base.base.import_attach) ++ drm_gem_shmem_unpin(&bo->base); ++ ++ drm_gpuvm_put(vm); ++ drm_gem_object_put(&bo->base.base); ++} ++ ++static void panthor_vm_cleanup_op_ctx(struct panthor_vm_op_ctx *op_ctx, ++ struct panthor_vm *vm) ++{ ++ struct panthor_vma *vma, *tmp_vma; ++ ++ u32 remaining_pt_count = op_ctx->rsvd_page_tables.count - ++ op_ctx->rsvd_page_tables.ptr; ++ ++ if (remaining_pt_count) { ++ kmem_cache_free_bulk(pt_cache, remaining_pt_count, ++ op_ctx->rsvd_page_tables.pages + ++ op_ctx->rsvd_page_tables.ptr); ++ } ++ ++ kfree(op_ctx->rsvd_page_tables.pages); ++ ++ if (op_ctx->map.vm_bo) ++ panthor_vm_bo_put(op_ctx->map.vm_bo); ++ ++ for (u32 i = 0; i < ARRAY_SIZE(op_ctx->preallocated_vmas); i++) ++ kfree(op_ctx->preallocated_vmas[i]); ++ ++ list_for_each_entry_safe(vma, tmp_vma, &op_ctx->returned_vmas, node) { ++ list_del(&vma->node); ++ panthor_vm_bo_put(vma->base.vm_bo); ++ kfree(vma); ++ } ++} ++ ++static struct panthor_vma * ++panthor_vm_op_ctx_get_vma(struct panthor_vm_op_ctx *op_ctx) ++{ ++ for (u32 i = 0; i < ARRAY_SIZE(op_ctx->preallocated_vmas); i++) { ++ struct panthor_vma *vma = op_ctx->preallocated_vmas[i]; ++ ++ if (vma) { ++ op_ctx->preallocated_vmas[i] = NULL; ++ return vma; ++ } ++ } ++ ++ return NULL; ++} ++ ++static int ++panthor_vm_op_ctx_prealloc_vmas(struct panthor_vm_op_ctx *op_ctx) ++{ ++ u32 vma_count; ++ ++ switch (op_ctx->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) { ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: ++ /* One VMA for the new mapping, and two more VMAs for the remap case ++ * which might contain both a prev and next VA. ++ */ ++ vma_count = 3; ++ break; ++ ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP: ++ /* Partial unmaps might trigger a remap with either a prev or a next VA, ++ * but not both. ++ */ ++ vma_count = 1; ++ break; ++ ++ default: ++ return 0; ++ } ++ ++ for (u32 i = 0; i < vma_count; i++) { ++ struct panthor_vma *vma = kzalloc(sizeof(*vma), GFP_KERNEL); ++ ++ if (!vma) ++ return -ENOMEM; ++ ++ op_ctx->preallocated_vmas[i] = vma; ++ } ++ ++ return 0; ++} ++ ++#define PANTHOR_VM_BIND_OP_MAP_FLAGS \ ++ (DRM_PANTHOR_VM_BIND_OP_MAP_READONLY | \ ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | \ ++ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED | \ ++ DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) ++ ++static int panthor_vm_prepare_map_op_ctx(struct panthor_vm_op_ctx *op_ctx, ++ struct panthor_vm *vm, ++ struct panthor_gem_object *bo, ++ u64 offset, ++ u64 size, u64 va, ++ u32 flags) ++{ ++ struct drm_gpuvm_bo *preallocated_vm_bo; ++ struct sg_table *sgt = NULL; ++ u64 pt_count; ++ int ret; ++ ++ if (!bo) ++ return -EINVAL; ++ ++ if ((flags & ~PANTHOR_VM_BIND_OP_MAP_FLAGS) || ++ (flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) != DRM_PANTHOR_VM_BIND_OP_TYPE_MAP) ++ return -EINVAL; ++ ++ /* Make sure the VA and size are aligned and in-bounds. */ ++ if (size > bo->base.base.size || offset > bo->base.base.size - size) ++ return -EINVAL; ++ ++ /* If the BO has an exclusive VM attached, it can't be mapped to other VMs. */ ++ if (bo->exclusive_vm_root_gem && ++ bo->exclusive_vm_root_gem != panthor_vm_root_gem(vm)) ++ return -EINVAL; ++ ++ memset(op_ctx, 0, sizeof(*op_ctx)); ++ INIT_LIST_HEAD(&op_ctx->returned_vmas); ++ op_ctx->flags = flags; ++ op_ctx->va.range = size; ++ op_ctx->va.addr = va; ++ ++ ret = panthor_vm_op_ctx_prealloc_vmas(op_ctx); ++ if (ret) ++ goto err_cleanup; ++ ++ if (!bo->base.base.import_attach) { ++ /* Pre-reserve the BO pages, so the map operation doesn't have to ++ * allocate. ++ */ ++ ret = drm_gem_shmem_pin(&bo->base); ++ if (ret) ++ goto err_cleanup; ++ } ++ ++ sgt = drm_gem_shmem_get_pages_sgt(&bo->base); ++ if (IS_ERR(sgt)) { ++ if (!bo->base.base.import_attach) ++ drm_gem_shmem_unpin(&bo->base); ++ ++ ret = PTR_ERR(sgt); ++ goto err_cleanup; ++ } ++ ++ op_ctx->map.sgt = sgt; ++ ++ preallocated_vm_bo = drm_gpuvm_bo_create(&vm->base, &bo->base.base); ++ if (!preallocated_vm_bo) { ++ if (!bo->base.base.import_attach) ++ drm_gem_shmem_unpin(&bo->base); ++ ++ ret = -ENOMEM; ++ goto err_cleanup; ++ } ++ ++ mutex_lock(&bo->gpuva_list_lock); ++ op_ctx->map.vm_bo = drm_gpuvm_bo_obtain_prealloc(preallocated_vm_bo); ++ mutex_unlock(&bo->gpuva_list_lock); ++ ++ /* If the a vm_bo for this combination exists, it already ++ * retains a pin ref, and we can release the one we took earlier. ++ * ++ * If our pre-allocated vm_bo is picked, it now retains the pin ref, ++ * which will be released in panthor_vm_bo_put(). ++ */ ++ if (preallocated_vm_bo != op_ctx->map.vm_bo && ++ !bo->base.base.import_attach) ++ drm_gem_shmem_unpin(&bo->base); ++ ++ op_ctx->map.bo_offset = offset; ++ ++ /* L1, L2 and L3 page tables. ++ * We could optimize L3 allocation by iterating over the sgt and merging ++ * 2M contiguous blocks, but it's simpler to over-provision and return ++ * the pages if they're not used. ++ */ ++ pt_count = ((ALIGN(va + size, 1ull << 39) - ALIGN_DOWN(va, 1ull << 39)) >> 39) + ++ ((ALIGN(va + size, 1ull << 30) - ALIGN_DOWN(va, 1ull << 30)) >> 30) + ++ ((ALIGN(va + size, 1ull << 21) - ALIGN_DOWN(va, 1ull << 21)) >> 21); ++ ++ op_ctx->rsvd_page_tables.pages = kcalloc(pt_count, ++ sizeof(*op_ctx->rsvd_page_tables.pages), ++ GFP_KERNEL); ++ if (!op_ctx->rsvd_page_tables.pages) ++ goto err_cleanup; ++ ++ ret = kmem_cache_alloc_bulk(pt_cache, GFP_KERNEL, pt_count, ++ op_ctx->rsvd_page_tables.pages); ++ op_ctx->rsvd_page_tables.count = ret; ++ if (ret != pt_count) { ++ ret = -ENOMEM; ++ goto err_cleanup; ++ } ++ ++ /* Insert BO into the extobj list last, when we know nothing can fail. */ ++ dma_resv_lock(panthor_vm_resv(vm), NULL); ++ drm_gpuvm_bo_extobj_add(op_ctx->map.vm_bo); ++ dma_resv_unlock(panthor_vm_resv(vm)); ++ ++ return 0; ++ ++err_cleanup: ++ panthor_vm_cleanup_op_ctx(op_ctx, vm); ++ return ret; ++} ++ ++static int panthor_vm_prepare_unmap_op_ctx(struct panthor_vm_op_ctx *op_ctx, ++ struct panthor_vm *vm, ++ u64 va, u64 size) ++{ ++ u32 pt_count = 0; ++ int ret; ++ ++ memset(op_ctx, 0, sizeof(*op_ctx)); ++ INIT_LIST_HEAD(&op_ctx->returned_vmas); ++ op_ctx->va.range = size; ++ op_ctx->va.addr = va; ++ op_ctx->flags = DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP; ++ ++ /* Pre-allocate L3 page tables to account for the split-2M-block ++ * situation on unmap. ++ */ ++ if (va != ALIGN(va, SZ_2M)) ++ pt_count++; ++ ++ if (va + size != ALIGN(va + size, SZ_2M) && ++ ALIGN(va + size, SZ_2M) != ALIGN(va, SZ_2M)) ++ pt_count++; ++ ++ ret = panthor_vm_op_ctx_prealloc_vmas(op_ctx); ++ if (ret) ++ goto err_cleanup; ++ ++ if (pt_count) { ++ op_ctx->rsvd_page_tables.pages = kcalloc(pt_count, ++ sizeof(*op_ctx->rsvd_page_tables.pages), ++ GFP_KERNEL); ++ if (!op_ctx->rsvd_page_tables.pages) ++ goto err_cleanup; ++ ++ ret = kmem_cache_alloc_bulk(pt_cache, GFP_KERNEL, pt_count, ++ op_ctx->rsvd_page_tables.pages); ++ if (ret != pt_count) { ++ ret = -ENOMEM; ++ goto err_cleanup; ++ } ++ op_ctx->rsvd_page_tables.count = pt_count; ++ } ++ ++ return 0; ++ ++err_cleanup: ++ panthor_vm_cleanup_op_ctx(op_ctx, vm); ++ return ret; ++} ++ ++static void panthor_vm_prepare_sync_only_op_ctx(struct panthor_vm_op_ctx *op_ctx, ++ struct panthor_vm *vm) ++{ ++ memset(op_ctx, 0, sizeof(*op_ctx)); ++ INIT_LIST_HEAD(&op_ctx->returned_vmas); ++ op_ctx->flags = DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY; ++} ++ ++/** ++ * panthor_vm_get_bo_for_va() - Get the GEM object mapped at a virtual address ++ * @vm: VM to look into. ++ * @va: Virtual address to search for. ++ * @bo_offset: Offset of the GEM object mapped at this virtual address. ++ * Only valid on success. ++ * ++ * The object returned by this function might no longer be mapped when the ++ * function returns. It's the caller responsibility to ensure there's no ++ * concurrent map/unmap operations making the returned value invalid, or ++ * make sure it doesn't matter if the object is no longer mapped. ++ * ++ * Return: A valid pointer on success, an ERR_PTR() otherwise. ++ */ ++struct panthor_gem_object * ++panthor_vm_get_bo_for_va(struct panthor_vm *vm, u64 va, u64 *bo_offset) ++{ ++ struct panthor_gem_object *bo = ERR_PTR(-ENOENT); ++ struct drm_gpuva *gpuva; ++ struct panthor_vma *vma; ++ ++ /* Take the VM lock to prevent concurrent map/unmap operations. */ ++ mutex_lock(&vm->op_lock); ++ gpuva = drm_gpuva_find_first(&vm->base, va, 1); ++ vma = gpuva ? container_of(gpuva, struct panthor_vma, base) : NULL; ++ if (vma && vma->base.gem.obj) { ++ drm_gem_object_get(vma->base.gem.obj); ++ bo = to_panthor_bo(vma->base.gem.obj); ++ *bo_offset = vma->base.gem.offset + (va - vma->base.va.addr); ++ } ++ mutex_unlock(&vm->op_lock); ++ ++ return bo; ++} ++ ++#define PANTHOR_VM_MIN_KERNEL_VA_SIZE SZ_256M ++ ++static u64 ++panthor_vm_create_get_user_va_range(const struct drm_panthor_vm_create *args, ++ u64 full_va_range) ++{ ++ u64 user_va_range; ++ ++ /* Make sure we have a minimum amount of VA space for kernel objects. */ ++ if (full_va_range < PANTHOR_VM_MIN_KERNEL_VA_SIZE) ++ return 0; ++ ++ if (args->user_va_range) { ++ /* Use the user provided value if != 0. */ ++ user_va_range = args->user_va_range; ++ } else if (TASK_SIZE_OF(current) < full_va_range) { ++ /* If the task VM size is smaller than the GPU VA range, pick this ++ * as our default user VA range, so userspace can CPU/GPU map buffers ++ * at the same address. ++ */ ++ user_va_range = TASK_SIZE_OF(current); ++ } else { ++ /* If the GPU VA range is smaller than the task VM size, we ++ * just have to live with the fact we won't be able to map ++ * all buffers at the same GPU/CPU address. ++ * ++ * If the GPU VA range is bigger than 4G (more than 32-bit of ++ * VA), we split the range in two, and assign half of it to ++ * the user and the other half to the kernel, if it's not, we ++ * keep the kernel VA space as small as possible. ++ */ ++ user_va_range = full_va_range > SZ_4G ? ++ full_va_range / 2 : ++ full_va_range - PANTHOR_VM_MIN_KERNEL_VA_SIZE; ++ } ++ ++ if (full_va_range - PANTHOR_VM_MIN_KERNEL_VA_SIZE < user_va_range) ++ user_va_range = full_va_range - PANTHOR_VM_MIN_KERNEL_VA_SIZE; ++ ++ return user_va_range; ++} ++ ++#define PANTHOR_VM_CREATE_FLAGS 0 ++ ++static int ++panthor_vm_create_check_args(const struct panthor_device *ptdev, ++ const struct drm_panthor_vm_create *args, ++ u64 *kernel_va_start, u64 *kernel_va_range) ++{ ++ u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features); ++ u64 full_va_range = 1ull << va_bits; ++ u64 user_va_range; ++ ++ if (args->flags & ~PANTHOR_VM_CREATE_FLAGS) ++ return -EINVAL; ++ ++ user_va_range = panthor_vm_create_get_user_va_range(args, full_va_range); ++ if (!user_va_range || (args->user_va_range && args->user_va_range > user_va_range)) ++ return -EINVAL; ++ ++ /* Pick a kernel VA range that's a power of two, to have a clear split. */ ++ *kernel_va_range = rounddown_pow_of_two(full_va_range - user_va_range); ++ *kernel_va_start = full_va_range - *kernel_va_range; ++ return 0; ++} ++ ++/* ++ * Only 32 VMs per open file. If that becomes a limiting factor, we can ++ * increase this number. ++ */ ++#define PANTHOR_MAX_VMS_PER_FILE 32 ++ ++/** ++ * panthor_vm_pool_create_vm() - Create a VM ++ * @pool: The VM to create this VM on. ++ * @kernel_va_start: Start of the region reserved for kernel objects. ++ * @kernel_va_range: Size of the region reserved for kernel objects. ++ * ++ * Return: a positive VM ID on success, a negative error code otherwise. ++ */ ++int panthor_vm_pool_create_vm(struct panthor_device *ptdev, ++ struct panthor_vm_pool *pool, ++ struct drm_panthor_vm_create *args) ++{ ++ u64 kernel_va_start, kernel_va_range; ++ struct panthor_vm *vm; ++ int ret; ++ u32 id; ++ ++ ret = panthor_vm_create_check_args(ptdev, args, &kernel_va_start, &kernel_va_range); ++ if (ret) ++ return ret; ++ ++ vm = panthor_vm_create(ptdev, false, kernel_va_start, kernel_va_range, ++ kernel_va_start, kernel_va_range); ++ if (IS_ERR(vm)) ++ return PTR_ERR(vm); ++ ++ ret = xa_alloc(&pool->xa, &id, vm, ++ XA_LIMIT(1, PANTHOR_MAX_VMS_PER_FILE), GFP_KERNEL); ++ ++ if (ret) { ++ panthor_vm_put(vm); ++ return ret; ++ } ++ ++ args->user_va_range = kernel_va_start; ++ return id; ++} ++ ++static void panthor_vm_destroy(struct panthor_vm *vm) ++{ ++ if (!vm) ++ return; ++ ++ vm->destroyed = true; ++ ++ mutex_lock(&vm->heaps.lock); ++ panthor_heap_pool_destroy(vm->heaps.pool); ++ vm->heaps.pool = NULL; ++ mutex_unlock(&vm->heaps.lock); ++ ++ drm_WARN_ON(&vm->ptdev->base, ++ panthor_vm_unmap_range(vm, vm->base.mm_start, vm->base.mm_range)); ++ panthor_vm_put(vm); ++} ++ ++/** ++ * panthor_vm_pool_destroy_vm() - Destroy a VM. ++ * @pool: VM pool. ++ * @handle: VM handle. ++ * ++ * This function doesn't free the VM object or its resources, it just kills ++ * all mappings, and makes sure nothing can be mapped after that point. ++ * ++ * If there was any active jobs at the time this function is called, these ++ * jobs should experience page faults and be killed as a result. ++ * ++ * The VM resources are freed when the last reference on the VM object is ++ * dropped. ++ */ ++int panthor_vm_pool_destroy_vm(struct panthor_vm_pool *pool, u32 handle) ++{ ++ struct panthor_vm *vm; ++ ++ vm = xa_erase(&pool->xa, handle); ++ ++ panthor_vm_destroy(vm); ++ ++ return vm ? 0 : -EINVAL; ++} ++ ++/** ++ * panthor_vm_pool_get_vm() - Retrieve VM object bound to a VM handle ++ * @pool: VM pool to check. ++ * @handle: Handle of the VM to retrieve. ++ * ++ * Return: A valid pointer if the VM exists, NULL otherwise. ++ */ ++struct panthor_vm * ++panthor_vm_pool_get_vm(struct panthor_vm_pool *pool, u32 handle) ++{ ++ struct panthor_vm *vm; ++ ++ vm = panthor_vm_get(xa_load(&pool->xa, handle)); ++ ++ return vm; ++} ++ ++/** ++ * panthor_vm_pool_destroy() - Destroy a VM pool. ++ * @pfile: File. ++ * ++ * Destroy all VMs in the pool, and release the pool resources. ++ * ++ * Note that VMs can outlive the pool they were created from if other ++ * objects hold a reference to there VMs. ++ */ ++void panthor_vm_pool_destroy(struct panthor_file *pfile) ++{ ++ struct panthor_vm *vm; ++ unsigned long i; ++ ++ if (!pfile->vms) ++ return; ++ ++ xa_for_each(&pfile->vms->xa, i, vm) ++ panthor_vm_destroy(vm); ++ ++ xa_destroy(&pfile->vms->xa); ++ kfree(pfile->vms); ++} ++ ++/** ++ * panthor_vm_pool_create() - Create a VM pool ++ * @pfile: File. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_pool_create(struct panthor_file *pfile) ++{ ++ pfile->vms = kzalloc(sizeof(*pfile->vms), GFP_KERNEL); ++ if (!pfile->vms) ++ return -ENOMEM; ++ ++ xa_init_flags(&pfile->vms->xa, XA_FLAGS_ALLOC1); ++ return 0; ++} ++ ++/* dummy TLB ops, the real TLB flush happens in panthor_vm_flush_range() */ ++static void mmu_tlb_flush_all(void *cookie) ++{ ++} ++ ++static void mmu_tlb_flush_walk(unsigned long iova, size_t size, size_t granule, void *cookie) ++{ ++} ++ ++static const struct iommu_flush_ops mmu_tlb_ops = { ++ .tlb_flush_all = mmu_tlb_flush_all, ++ .tlb_flush_walk = mmu_tlb_flush_walk, ++}; ++ ++static const char *access_type_name(struct panthor_device *ptdev, ++ u32 fault_status) ++{ ++ switch (fault_status & AS_FAULTSTATUS_ACCESS_TYPE_MASK) { ++ case AS_FAULTSTATUS_ACCESS_TYPE_ATOMIC: ++ return "ATOMIC"; ++ case AS_FAULTSTATUS_ACCESS_TYPE_READ: ++ return "READ"; ++ case AS_FAULTSTATUS_ACCESS_TYPE_WRITE: ++ return "WRITE"; ++ case AS_FAULTSTATUS_ACCESS_TYPE_EX: ++ return "EXECUTE"; ++ default: ++ drm_WARN_ON(&ptdev->base, 1); ++ return NULL; ++ } ++} ++ ++static void panthor_mmu_irq_handler(struct panthor_device *ptdev, u32 status) ++{ ++ bool has_unhandled_faults = false; ++ ++ status = panthor_mmu_fault_mask(ptdev, status); ++ while (status) { ++ u32 as = ffs(status | (status >> 16)) - 1; ++ u32 mask = panthor_mmu_as_fault_mask(ptdev, as); ++ u32 new_int_mask; ++ u64 addr; ++ u32 fault_status; ++ u32 exception_type; ++ u32 access_type; ++ u32 source_id; ++ ++ fault_status = gpu_read(ptdev, AS_FAULTSTATUS(as)); ++ addr = gpu_read(ptdev, AS_FAULTADDRESS_LO(as)); ++ addr |= (u64)gpu_read(ptdev, AS_FAULTADDRESS_HI(as)) << 32; ++ ++ /* decode the fault status */ ++ exception_type = fault_status & 0xFF; ++ access_type = (fault_status >> 8) & 0x3; ++ source_id = (fault_status >> 16); ++ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ ++ ptdev->mmu->as.faulty_mask |= mask; ++ new_int_mask = ++ panthor_mmu_fault_mask(ptdev, ~ptdev->mmu->as.faulty_mask); ++ ++ /* terminal fault, print info about the fault */ ++ drm_err(&ptdev->base, ++ "Unhandled Page fault in AS%d at VA 0x%016llX\n" ++ "raw fault status: 0x%X\n" ++ "decoded fault status: %s\n" ++ "exception type 0x%X: %s\n" ++ "access type 0x%X: %s\n" ++ "source id 0x%X\n", ++ as, addr, ++ fault_status, ++ (fault_status & (1 << 10) ? "DECODER FAULT" : "SLAVE FAULT"), ++ exception_type, panthor_exception_name(ptdev, exception_type), ++ access_type, access_type_name(ptdev, fault_status), ++ source_id); ++ ++ /* Ignore MMU interrupts on this AS until it's been ++ * re-enabled. ++ */ ++ ptdev->mmu->irq.mask = new_int_mask; ++ gpu_write(ptdev, MMU_INT_MASK, new_int_mask); ++ ++ if (ptdev->mmu->as.slots[as].vm) ++ ptdev->mmu->as.slots[as].vm->unhandled_fault = true; ++ ++ /* Disable the MMU to kill jobs on this AS. */ ++ panthor_mmu_as_disable(ptdev, as); ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++ status &= ~mask; ++ has_unhandled_faults = true; ++ } ++ ++ if (has_unhandled_faults) ++ panthor_sched_report_mmu_fault(ptdev); ++} ++PANTHOR_IRQ_HANDLER(mmu, MMU, panthor_mmu_irq_handler); ++ ++/** ++ * panthor_mmu_suspend() - Suspend the MMU logic ++ * @ptdev: Device. ++ * ++ * All we do here is de-assign the AS slots on all active VMs, so things ++ * get flushed to the main memory, and no further access to these VMs are ++ * possible. ++ * ++ * We also suspend the MMU IRQ. ++ */ ++void panthor_mmu_suspend(struct panthor_device *ptdev) ++{ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ for (u32 i = 0; i < ARRAY_SIZE(ptdev->mmu->as.slots); i++) { ++ struct panthor_vm *vm = ptdev->mmu->as.slots[i].vm; ++ ++ if (vm) { ++ drm_WARN_ON(&ptdev->base, panthor_mmu_as_disable(ptdev, i)); ++ panthor_vm_release_as_locked(vm); ++ } ++ } ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++ panthor_mmu_irq_suspend(&ptdev->mmu->irq); ++} ++ ++/** ++ * panthor_mmu_resume() - Resume the MMU logic ++ * @ptdev: Device. ++ * ++ * Resume the IRQ. ++ * ++ * We don't re-enable previously active VMs. We assume other parts of the ++ * driver will call panthor_vm_active() on the VMs they intend to use. ++ */ ++void panthor_mmu_resume(struct panthor_device *ptdev) ++{ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ ptdev->mmu->as.alloc_mask = 0; ++ ptdev->mmu->as.faulty_mask = 0; ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++ panthor_mmu_irq_resume(&ptdev->mmu->irq, panthor_mmu_fault_mask(ptdev, ~0)); ++} ++ ++/** ++ * panthor_mmu_pre_reset() - Prepare for a reset ++ * @ptdev: Device. ++ * ++ * Suspend the IRQ, and make sure all VM_BIND queues are stopped, so we ++ * don't get asked to do a VM operation while the GPU is down. ++ * ++ * We don't cleanly shutdown the AS slots here, because the reset might ++ * come from an AS_ACTIVE_BIT stuck situation. ++ */ ++void panthor_mmu_pre_reset(struct panthor_device *ptdev) ++{ ++ struct panthor_vm *vm; ++ ++ panthor_mmu_irq_suspend(&ptdev->mmu->irq); ++ ++ mutex_lock(&ptdev->mmu->vm.lock); ++ ptdev->mmu->vm.reset_in_progress = true; ++ list_for_each_entry(vm, &ptdev->mmu->vm.list, node) ++ panthor_vm_stop(vm); ++ mutex_unlock(&ptdev->mmu->vm.lock); ++} ++ ++/** ++ * panthor_mmu_post_reset() - Restore things after a reset ++ * @ptdev: Device. ++ * ++ * Put the MMU logic back in action after a reset. That implies resuming the ++ * IRQ and re-enabling the VM_BIND queues. ++ */ ++void panthor_mmu_post_reset(struct panthor_device *ptdev) ++{ ++ struct panthor_vm *vm; ++ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ ++ /* Now that the reset is effective, we can assume that none of the ++ * AS slots are setup, and clear the faulty flags too. ++ */ ++ ptdev->mmu->as.alloc_mask = 0; ++ ptdev->mmu->as.faulty_mask = 0; ++ ++ for (u32 i = 0; i < ARRAY_SIZE(ptdev->mmu->as.slots); i++) { ++ struct panthor_vm *vm = ptdev->mmu->as.slots[i].vm; ++ ++ if (vm) ++ panthor_vm_release_as_locked(vm); ++ } ++ ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++ panthor_mmu_irq_resume(&ptdev->mmu->irq, panthor_mmu_fault_mask(ptdev, ~0)); ++ ++ /* Restart the VM_BIND queues. */ ++ mutex_lock(&ptdev->mmu->vm.lock); ++ list_for_each_entry(vm, &ptdev->mmu->vm.list, node) { ++ panthor_vm_start(vm); ++ } ++ ptdev->mmu->vm.reset_in_progress = false; ++ mutex_unlock(&ptdev->mmu->vm.lock); ++} ++ ++static void panthor_vm_free(struct drm_gpuvm *gpuvm) ++{ ++ struct panthor_vm *vm = container_of(gpuvm, struct panthor_vm, base); ++ struct panthor_device *ptdev = vm->ptdev; ++ ++ mutex_lock(&vm->heaps.lock); ++ if (drm_WARN_ON(&ptdev->base, vm->heaps.pool)) ++ panthor_heap_pool_destroy(vm->heaps.pool); ++ mutex_unlock(&vm->heaps.lock); ++ mutex_destroy(&vm->heaps.lock); ++ ++ mutex_lock(&ptdev->mmu->vm.lock); ++ list_del(&vm->node); ++ /* Restore the scheduler state so we can call drm_sched_entity_destroy() ++ * and drm_sched_fini(). If get there, that means we have no job left ++ * and no new jobs can be queued, so we can start the scheduler without ++ * risking interfering with the reset. ++ */ ++ if (ptdev->mmu->vm.reset_in_progress) ++ panthor_vm_start(vm); ++ mutex_unlock(&ptdev->mmu->vm.lock); ++ ++ drm_sched_entity_destroy(&vm->entity); ++ drm_sched_fini(&vm->sched); ++ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ if (vm->as.id >= 0) { ++ int cookie; ++ ++ if (drm_dev_enter(&ptdev->base, &cookie)) { ++ panthor_mmu_as_disable(ptdev, vm->as.id); ++ drm_dev_exit(cookie); ++ } ++ ++ ptdev->mmu->as.slots[vm->as.id].vm = NULL; ++ clear_bit(vm->as.id, &ptdev->mmu->as.alloc_mask); ++ list_del(&vm->as.lru_node); ++ } ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++ ++ free_io_pgtable_ops(vm->pgtbl_ops); ++ ++ drm_mm_takedown(&vm->mm); ++ kfree(vm); ++} ++ ++/** ++ * panthor_vm_put() - Release a reference on a VM ++ * @vm: VM to release the reference on. Can be NULL. ++ */ ++void panthor_vm_put(struct panthor_vm *vm) ++{ ++ drm_gpuvm_put(vm ? &vm->base : NULL); ++} ++ ++/** ++ * panthor_vm_get() - Get a VM reference ++ * @vm: VM to get the reference on. Can be NULL. ++ * ++ * Return: @vm value. ++ */ ++struct panthor_vm *panthor_vm_get(struct panthor_vm *vm) ++{ ++ if (vm) ++ drm_gpuvm_get(&vm->base); ++ ++ return vm; ++} ++ ++/** ++ * panthor_vm_get_heap_pool() - Get the heap pool attached to a VM ++ * @vm: VM to query the heap pool on. ++ * @create: True if the heap pool should be created when it doesn't exist. ++ * ++ * Heap pools are per-VM. This function allows one to retrieve the heap pool ++ * attached to a VM. ++ * ++ * If no heap pool exists yet, and @create is true, we create one. ++ * ++ * The returned panthor_heap_pool should be released with panthor_heap_pool_put(). ++ * ++ * Return: A valid pointer on success, an ERR_PTR() otherwise. ++ */ ++struct panthor_heap_pool *panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create) ++{ ++ struct panthor_heap_pool *pool; ++ ++ mutex_lock(&vm->heaps.lock); ++ if (!vm->heaps.pool && create) { ++ if (vm->destroyed) ++ pool = ERR_PTR(-EINVAL); ++ else ++ pool = panthor_heap_pool_create(vm->ptdev, vm); ++ ++ if (!IS_ERR(pool)) ++ vm->heaps.pool = panthor_heap_pool_get(pool); ++ } else { ++ pool = panthor_heap_pool_get(vm->heaps.pool); ++ } ++ mutex_unlock(&vm->heaps.lock); ++ ++ return pool; ++} ++ ++static u64 mair_to_memattr(u64 mair) ++{ ++ u64 memattr = 0; ++ u32 i; ++ ++ for (i = 0; i < 8; i++) { ++ u8 in_attr = mair >> (8 * i), out_attr; ++ u8 outer = in_attr >> 4, inner = in_attr & 0xf; ++ ++ /* For caching to be enabled, inner and outer caching policy ++ * have to be both write-back, if one of them is write-through ++ * or non-cacheable, we just choose non-cacheable. Device ++ * memory is also translated to non-cacheable. ++ */ ++ if (!(outer & 3) || !(outer & 4) || !(inner & 4)) { ++ out_attr = AS_MEMATTR_AARCH64_INNER_OUTER_NC | ++ AS_MEMATTR_AARCH64_SH_MIDGARD_INNER | ++ AS_MEMATTR_AARCH64_INNER_ALLOC_EXPL(false, false); ++ } else { ++ /* Use SH_CPU_INNER mode so SH_IS, which is used when ++ * IOMMU_CACHE is set, actually maps to the standard ++ * definition of inner-shareable and not Mali's ++ * internal-shareable mode. ++ */ ++ out_attr = AS_MEMATTR_AARCH64_INNER_OUTER_WB | ++ AS_MEMATTR_AARCH64_SH_CPU_INNER | ++ AS_MEMATTR_AARCH64_INNER_ALLOC_EXPL(inner & 1, inner & 2); ++ } ++ ++ memattr |= (u64)out_attr << (8 * i); ++ } ++ ++ return memattr; ++} ++ ++static void panthor_vma_link(struct panthor_vm *vm, ++ struct panthor_vma *vma, ++ struct drm_gpuvm_bo *vm_bo) ++{ ++ struct panthor_gem_object *bo = to_panthor_bo(vma->base.gem.obj); ++ ++ mutex_lock(&bo->gpuva_list_lock); ++ drm_gpuva_link(&vma->base, vm_bo); ++ drm_WARN_ON(&vm->ptdev->base, drm_gpuvm_bo_put(vm_bo)); ++ mutex_unlock(&bo->gpuva_list_lock); ++} ++ ++static void panthor_vma_unlink(struct panthor_vm *vm, ++ struct panthor_vma *vma) ++{ ++ struct panthor_gem_object *bo = to_panthor_bo(vma->base.gem.obj); ++ struct drm_gpuvm_bo *vm_bo = drm_gpuvm_bo_get(vma->base.vm_bo); ++ ++ mutex_lock(&bo->gpuva_list_lock); ++ drm_gpuva_unlink(&vma->base); ++ mutex_unlock(&bo->gpuva_list_lock); ++ ++ /* drm_gpuva_unlink() release the vm_bo, but we manually retained it ++ * when entering this function, so we can implement deferred VMA ++ * destruction. Re-assign it here. ++ */ ++ vma->base.vm_bo = vm_bo; ++ list_add_tail(&vma->node, &vm->op_ctx->returned_vmas); ++} ++ ++static void panthor_vma_init(struct panthor_vma *vma, u32 flags) ++{ ++ INIT_LIST_HEAD(&vma->node); ++ vma->flags = flags; ++} ++ ++#define PANTHOR_VM_MAP_FLAGS \ ++ (DRM_PANTHOR_VM_BIND_OP_MAP_READONLY | \ ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | \ ++ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED) ++ ++static int panthor_gpuva_sm_step_map(struct drm_gpuva_op *op, void *priv) ++{ ++ struct panthor_vm *vm = priv; ++ struct panthor_vm_op_ctx *op_ctx = vm->op_ctx; ++ struct panthor_vma *vma = panthor_vm_op_ctx_get_vma(op_ctx); ++ int ret; ++ ++ if (!vma) ++ return -EINVAL; ++ ++ panthor_vma_init(vma, op_ctx->flags & PANTHOR_VM_MAP_FLAGS); ++ ++ ret = panthor_vm_map_pages(vm, op->map.va.addr, flags_to_prot(vma->flags), ++ op_ctx->map.sgt, op->map.gem.offset, ++ op->map.va.range); ++ if (ret) ++ return ret; ++ ++ /* Ref owned by the mapping now, clear the obj field so we don't release the ++ * pinning/obj ref behind GPUVA's back. ++ */ ++ drm_gpuva_map(&vm->base, &vma->base, &op->map); ++ panthor_vma_link(vm, vma, op_ctx->map.vm_bo); ++ op_ctx->map.vm_bo = NULL; ++ return 0; ++} ++ ++static int panthor_gpuva_sm_step_remap(struct drm_gpuva_op *op, ++ void *priv) ++{ ++ struct panthor_vma *unmap_vma = container_of(op->remap.unmap->va, struct panthor_vma, base); ++ struct panthor_vm *vm = priv; ++ struct panthor_vm_op_ctx *op_ctx = vm->op_ctx; ++ struct panthor_vma *prev_vma = NULL, *next_vma = NULL; ++ u64 unmap_start, unmap_range; ++ int ret; ++ ++ drm_gpuva_op_remap_to_unmap_range(&op->remap, &unmap_start, &unmap_range); ++ ret = panthor_vm_unmap_pages(vm, unmap_start, unmap_range); ++ if (ret) ++ return ret; ++ ++ if (op->remap.prev) { ++ prev_vma = panthor_vm_op_ctx_get_vma(op_ctx); ++ panthor_vma_init(prev_vma, unmap_vma->flags); ++ } ++ ++ if (op->remap.next) { ++ next_vma = panthor_vm_op_ctx_get_vma(op_ctx); ++ panthor_vma_init(next_vma, unmap_vma->flags); ++ } ++ ++ drm_gpuva_remap(prev_vma ? &prev_vma->base : NULL, ++ next_vma ? &next_vma->base : NULL, ++ &op->remap); ++ ++ if (prev_vma) { ++ /* panthor_vma_link() transfers the vm_bo ownership to ++ * the VMA object. Since the vm_bo we're passing is still ++ * owned by the old mapping which will be released when this ++ * mapping is destroyed, we need to grab a ref here. ++ */ ++ panthor_vma_link(vm, prev_vma, ++ drm_gpuvm_bo_get(op->remap.unmap->va->vm_bo)); ++ } ++ ++ if (next_vma) { ++ panthor_vma_link(vm, next_vma, ++ drm_gpuvm_bo_get(op->remap.unmap->va->vm_bo)); ++ } ++ ++ panthor_vma_unlink(vm, unmap_vma); ++ return 0; ++} ++ ++static int panthor_gpuva_sm_step_unmap(struct drm_gpuva_op *op, ++ void *priv) ++{ ++ struct panthor_vma *unmap_vma = container_of(op->unmap.va, struct panthor_vma, base); ++ struct panthor_vm *vm = priv; ++ int ret; ++ ++ ret = panthor_vm_unmap_pages(vm, unmap_vma->base.va.addr, ++ unmap_vma->base.va.range); ++ if (drm_WARN_ON(&vm->ptdev->base, ret)) ++ return ret; ++ ++ drm_gpuva_unmap(&op->unmap); ++ panthor_vma_unlink(vm, unmap_vma); ++ return 0; ++} ++ ++static const struct drm_gpuvm_ops panthor_gpuvm_ops = { ++ .vm_free = panthor_vm_free, ++ .sm_step_map = panthor_gpuva_sm_step_map, ++ .sm_step_remap = panthor_gpuva_sm_step_remap, ++ .sm_step_unmap = panthor_gpuva_sm_step_unmap, ++}; ++ ++/** ++ * panthor_vm_resv() - Get the dma_resv object attached to a VM. ++ * @vm: VM to get the dma_resv of. ++ * ++ * Return: A dma_resv object. ++ */ ++struct dma_resv *panthor_vm_resv(struct panthor_vm *vm) ++{ ++ return drm_gpuvm_resv(&vm->base); ++} ++ ++struct drm_gem_object *panthor_vm_root_gem(struct panthor_vm *vm) ++{ ++ if (!vm) ++ return NULL; ++ ++ return vm->base.r_obj; ++} ++ ++static int ++panthor_vm_exec_op(struct panthor_vm *vm, struct panthor_vm_op_ctx *op, ++ bool flag_vm_unusable_on_failure) ++{ ++ u32 op_type = op->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK; ++ int ret; ++ ++ if (op_type == DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY) ++ return 0; ++ ++ mutex_lock(&vm->op_lock); ++ vm->op_ctx = op; ++ switch (op_type) { ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: ++ if (vm->unusable) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ ret = drm_gpuvm_sm_map(&vm->base, vm, op->va.addr, op->va.range, ++ op->map.vm_bo->obj, op->map.bo_offset); ++ break; ++ ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP: ++ ret = drm_gpuvm_sm_unmap(&vm->base, vm, op->va.addr, op->va.range); ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (ret && flag_vm_unusable_on_failure) ++ vm->unusable = true; ++ ++ vm->op_ctx = NULL; ++ mutex_unlock(&vm->op_lock); ++ ++ return ret; ++} ++ ++static struct dma_fence * ++panthor_vm_bind_run_job(struct drm_sched_job *sched_job) ++{ ++ struct panthor_vm_bind_job *job = container_of(sched_job, struct panthor_vm_bind_job, base); ++ bool cookie; ++ int ret; ++ ++ /* Not only we report an error whose result is propagated to the ++ * drm_sched finished fence, but we also flag the VM as unusable, because ++ * a failure in the async VM_BIND results in an inconsistent state. VM needs ++ * to be destroyed and recreated. ++ */ ++ cookie = dma_fence_begin_signalling(); ++ ret = panthor_vm_exec_op(job->vm, &job->ctx, true); ++ dma_fence_end_signalling(cookie); ++ ++ return ret ? ERR_PTR(ret) : NULL; ++} ++ ++static void panthor_vm_bind_job_release(struct kref *kref) ++{ ++ struct panthor_vm_bind_job *job = container_of(kref, struct panthor_vm_bind_job, refcount); ++ ++ if (job->base.s_fence) ++ drm_sched_job_cleanup(&job->base); ++ ++ panthor_vm_cleanup_op_ctx(&job->ctx, job->vm); ++ panthor_vm_put(job->vm); ++ kfree(job); ++} ++ ++/** ++ * panthor_vm_bind_job_put() - Release a VM_BIND job reference ++ * @sched_job: Job to release the reference on. ++ */ ++void panthor_vm_bind_job_put(struct drm_sched_job *sched_job) ++{ ++ struct panthor_vm_bind_job *job = ++ container_of(sched_job, struct panthor_vm_bind_job, base); ++ ++ if (sched_job) ++ kref_put(&job->refcount, panthor_vm_bind_job_release); ++} ++ ++static void ++panthor_vm_bind_free_job(struct drm_sched_job *sched_job) ++{ ++ struct panthor_vm_bind_job *job = ++ container_of(sched_job, struct panthor_vm_bind_job, base); ++ ++ drm_sched_job_cleanup(sched_job); ++ ++ /* Do the heavy cleanups asynchronously, so we're out of the ++ * dma-signaling path and can acquire dma-resv locks safely. ++ */ ++ queue_work(panthor_cleanup_wq, &job->cleanup_op_ctx_work); ++} ++ ++static enum drm_gpu_sched_stat ++panthor_vm_bind_timedout_job(struct drm_sched_job *sched_job) ++{ ++ WARN(1, "VM_BIND ops are synchronous for now, there should be no timeout!"); ++ return DRM_GPU_SCHED_STAT_NOMINAL; ++} ++ ++static const struct drm_sched_backend_ops panthor_vm_bind_ops = { ++ .run_job = panthor_vm_bind_run_job, ++ .free_job = panthor_vm_bind_free_job, ++ .timedout_job = panthor_vm_bind_timedout_job, ++}; ++ ++/** ++ * panthor_vm_create() - Create a VM ++ * @ptdev: Device. ++ * @for_mcu: True if this is the FW MCU VM. ++ * @kernel_va_start: Start of the range reserved for kernel BO mapping. ++ * @kernel_va_size: Size of the range reserved for kernel BO mapping. ++ * @auto_kernel_va_start: Start of the auto-VA kernel range. ++ * @auto_kernel_va_size: Size of the auto-VA kernel range. ++ * ++ * Return: A valid pointer on success, an ERR_PTR() otherwise. ++ */ ++struct panthor_vm * ++panthor_vm_create(struct panthor_device *ptdev, bool for_mcu, ++ u64 kernel_va_start, u64 kernel_va_size, ++ u64 auto_kernel_va_start, u64 auto_kernel_va_size) ++{ ++ u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features); ++ u32 pa_bits = GPU_MMU_FEATURES_PA_BITS(ptdev->gpu_info.mmu_features); ++ u64 full_va_range = 1ull << va_bits; ++ struct drm_gem_object *dummy_gem; ++ struct drm_gpu_scheduler *sched; ++ struct io_pgtable_cfg pgtbl_cfg; ++ u64 mair, min_va, va_range; ++ struct panthor_vm *vm; ++ int ret; ++ ++ vm = kzalloc(sizeof(*vm), GFP_KERNEL); ++ if (!vm) ++ return ERR_PTR(-ENOMEM); ++ ++ /* We allocate a dummy GEM for the VM. */ ++ dummy_gem = drm_gpuvm_resv_object_alloc(&ptdev->base); ++ if (!dummy_gem) { ++ ret = -ENOMEM; ++ goto err_free_vm; ++ } ++ ++ mutex_init(&vm->heaps.lock); ++ vm->for_mcu = for_mcu; ++ vm->ptdev = ptdev; ++ mutex_init(&vm->op_lock); ++ ++ if (for_mcu) { ++ /* CSF MCU is a cortex M7, and can only address 4G */ ++ min_va = 0; ++ va_range = SZ_4G; ++ } else { ++ min_va = 0; ++ va_range = full_va_range; ++ } ++ ++ mutex_init(&vm->mm_lock); ++ drm_mm_init(&vm->mm, kernel_va_start, kernel_va_size); ++ vm->kernel_auto_va.start = auto_kernel_va_start; ++ vm->kernel_auto_va.end = vm->kernel_auto_va.start + auto_kernel_va_size - 1; ++ ++ INIT_LIST_HEAD(&vm->node); ++ INIT_LIST_HEAD(&vm->as.lru_node); ++ vm->as.id = -1; ++ refcount_set(&vm->as.active_cnt, 0); ++ ++ pgtbl_cfg = (struct io_pgtable_cfg) { ++ .pgsize_bitmap = SZ_4K | SZ_2M, ++ .ias = va_bits, ++ .oas = pa_bits, ++ .coherent_walk = ptdev->coherent, ++ .tlb = &mmu_tlb_ops, ++ .iommu_dev = ptdev->base.dev, ++ .alloc = alloc_pt, ++ .free = free_pt, ++ }; ++ ++ vm->pgtbl_ops = alloc_io_pgtable_ops(ARM_64_LPAE_S1, &pgtbl_cfg, vm); ++ if (!vm->pgtbl_ops) { ++ ret = -EINVAL; ++ goto err_mm_takedown; ++ } ++ ++ /* Bind operations are synchronous for now, no timeout needed. */ ++ ret = drm_sched_init(&vm->sched, &panthor_vm_bind_ops, ptdev->mmu->vm.wq, ++ 1, 1, 0, ++ MAX_SCHEDULE_TIMEOUT, NULL, NULL, ++ "panthor-vm-bind", ptdev->base.dev); ++ if (ret) ++ goto err_free_io_pgtable; ++ ++ sched = &vm->sched; ++ ret = drm_sched_entity_init(&vm->entity, 0, &sched, 1, NULL); ++ if (ret) ++ goto err_sched_fini; ++ ++ mair = io_pgtable_ops_to_pgtable(vm->pgtbl_ops)->cfg.arm_lpae_s1_cfg.mair; ++ vm->memattr = mair_to_memattr(mair); ++ ++ mutex_lock(&ptdev->mmu->vm.lock); ++ list_add_tail(&vm->node, &ptdev->mmu->vm.list); ++ ++ /* If a reset is in progress, stop the scheduler. */ ++ if (ptdev->mmu->vm.reset_in_progress) ++ panthor_vm_stop(vm); ++ mutex_unlock(&ptdev->mmu->vm.lock); ++ ++ /* We intentionally leave the reserved range to zero, because we want kernel VMAs ++ * to be handled the same way user VMAs are. ++ */ ++ drm_gpuvm_init(&vm->base, for_mcu ? "panthor-MCU-VM" : "panthor-GPU-VM", ++ DRM_GPUVM_RESV_PROTECTED, &ptdev->base, dummy_gem, ++ min_va, va_range, 0, 0, &panthor_gpuvm_ops); ++ drm_gem_object_put(dummy_gem); ++ return vm; ++ ++err_sched_fini: ++ drm_sched_fini(&vm->sched); ++ ++err_free_io_pgtable: ++ free_io_pgtable_ops(vm->pgtbl_ops); ++ ++err_mm_takedown: ++ drm_mm_takedown(&vm->mm); ++ drm_gem_object_put(dummy_gem); ++ ++err_free_vm: ++ kfree(vm); ++ return ERR_PTR(ret); ++} ++ ++static int ++panthor_vm_bind_prepare_op_ctx(struct drm_file *file, ++ struct panthor_vm *vm, ++ const struct drm_panthor_vm_bind_op *op, ++ struct panthor_vm_op_ctx *op_ctx) ++{ ++ struct drm_gem_object *gem; ++ int ret; ++ ++ /* Aligned on page size. */ ++ if ((op->va | op->size) & ~PAGE_MASK) ++ return -EINVAL; ++ ++ switch (op->flags & DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) { ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: ++ gem = drm_gem_object_lookup(file, op->bo_handle); ++ ret = panthor_vm_prepare_map_op_ctx(op_ctx, vm, ++ gem ? to_panthor_bo(gem) : NULL, ++ op->bo_offset, ++ op->size, ++ op->va, ++ op->flags); ++ drm_gem_object_put(gem); ++ return ret; ++ ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP: ++ if (op->flags & ~DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) ++ return -EINVAL; ++ ++ if (op->bo_handle || op->bo_offset) ++ return -EINVAL; ++ ++ return panthor_vm_prepare_unmap_op_ctx(op_ctx, vm, op->va, op->size); ++ ++ case DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY: ++ if (op->flags & ~DRM_PANTHOR_VM_BIND_OP_TYPE_MASK) ++ return -EINVAL; ++ ++ if (op->bo_handle || op->bo_offset) ++ return -EINVAL; ++ ++ if (op->va || op->size) ++ return -EINVAL; ++ ++ if (!op->syncs.count) ++ return -EINVAL; ++ ++ panthor_vm_prepare_sync_only_op_ctx(op_ctx, vm); ++ return 0; ++ ++ default: ++ return -EINVAL; ++ } ++} ++ ++static void panthor_vm_bind_job_cleanup_op_ctx_work(struct work_struct *work) ++{ ++ struct panthor_vm_bind_job *job = ++ container_of(work, struct panthor_vm_bind_job, cleanup_op_ctx_work); ++ ++ panthor_vm_bind_job_put(&job->base); ++} ++ ++/** ++ * panthor_vm_bind_job_create() - Create a VM_BIND job ++ * @file: File. ++ * @vm: VM targeted by the VM_BIND job. ++ * @op: VM operation data. ++ * ++ * Return: A valid pointer on success, an ERR_PTR() otherwise. ++ */ ++struct drm_sched_job * ++panthor_vm_bind_job_create(struct drm_file *file, ++ struct panthor_vm *vm, ++ const struct drm_panthor_vm_bind_op *op) ++{ ++ struct panthor_vm_bind_job *job; ++ int ret; ++ ++ if (!vm) ++ return ERR_PTR(-EINVAL); ++ ++ if (vm->destroyed || vm->unusable) ++ return ERR_PTR(-EINVAL); ++ ++ job = kzalloc(sizeof(*job), GFP_KERNEL); ++ if (!job) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = panthor_vm_bind_prepare_op_ctx(file, vm, op, &job->ctx); ++ if (ret) { ++ kfree(job); ++ return ERR_PTR(ret); ++ } ++ ++ INIT_WORK(&job->cleanup_op_ctx_work, panthor_vm_bind_job_cleanup_op_ctx_work); ++ kref_init(&job->refcount); ++ job->vm = panthor_vm_get(vm); ++ ++ ret = drm_sched_job_init(&job->base, &vm->entity, 1, vm); ++ if (ret) ++ goto err_put_job; ++ ++ return &job->base; ++ ++err_put_job: ++ panthor_vm_bind_job_put(&job->base); ++ return ERR_PTR(ret); ++} ++ ++/** ++ * panthor_vm_bind_job_prepare_resvs() - Prepare VM_BIND job dma_resvs ++ * @exec: The locking/preparation context. ++ * @sched_job: The job to prepare resvs on. ++ * ++ * Locks and prepare the VM resv. ++ * ++ * If this is a map operation, locks and prepares the GEM resv. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_bind_job_prepare_resvs(struct drm_exec *exec, ++ struct drm_sched_job *sched_job) ++{ ++ struct panthor_vm_bind_job *job = container_of(sched_job, struct panthor_vm_bind_job, base); ++ int ret; ++ ++ /* Acquire the VM lock an reserve a slot for this VM bind job. */ ++ ret = drm_gpuvm_prepare_vm(&job->vm->base, exec, 1); ++ if (ret) ++ return ret; ++ ++ if (job->ctx.map.vm_bo) { ++ /* Lock/prepare the GEM being mapped. */ ++ ret = drm_exec_prepare_obj(exec, job->ctx.map.vm_bo->obj, 1); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_vm_bind_job_update_resvs() - Update the resv objects touched by a job ++ * @exec: drm_exec context. ++ * @sched_job: Job to update the resvs on. ++ */ ++void panthor_vm_bind_job_update_resvs(struct drm_exec *exec, ++ struct drm_sched_job *sched_job) ++{ ++ struct panthor_vm_bind_job *job = container_of(sched_job, struct panthor_vm_bind_job, base); ++ ++ /* Explicit sync => we just register our job finished fence as bookkeep. */ ++ drm_gpuvm_resv_add_fence(&job->vm->base, exec, ++ &sched_job->s_fence->finished, ++ DMA_RESV_USAGE_BOOKKEEP, ++ DMA_RESV_USAGE_BOOKKEEP); ++} ++ ++void panthor_vm_update_resvs(struct panthor_vm *vm, struct drm_exec *exec, ++ struct dma_fence *fence, ++ enum dma_resv_usage private_usage, ++ enum dma_resv_usage extobj_usage) ++{ ++ drm_gpuvm_resv_add_fence(&vm->base, exec, fence, private_usage, extobj_usage); ++} ++ ++/** ++ * panthor_vm_bind_exec_sync_op() - Execute a VM_BIND operation synchronously. ++ * @file: File. ++ * @vm: VM targeted by the VM operation. ++ * @op: Data describing the VM operation. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_bind_exec_sync_op(struct drm_file *file, ++ struct panthor_vm *vm, ++ struct drm_panthor_vm_bind_op *op) ++{ ++ struct panthor_vm_op_ctx op_ctx; ++ int ret; ++ ++ /* No sync objects allowed on synchronous operations. */ ++ if (op->syncs.count) ++ return -EINVAL; ++ ++ if (!op->size) ++ return 0; ++ ++ ret = panthor_vm_bind_prepare_op_ctx(file, vm, op, &op_ctx); ++ if (ret) ++ return ret; ++ ++ ret = panthor_vm_exec_op(vm, &op_ctx, false); ++ panthor_vm_cleanup_op_ctx(&op_ctx, vm); ++ ++ return ret; ++} ++ ++/** ++ * panthor_vm_map_bo_range() - Map a GEM object range to a VM ++ * @vm: VM to map the GEM to. ++ * @bo: GEM object to map. ++ * @offset: Offset in the GEM object. ++ * @size: Size to map. ++ * @va: Virtual address to map the object to. ++ * @flags: Combination of drm_panthor_vm_bind_op_flags flags. ++ * Only map-related flags are valid. ++ * ++ * Internal use only. For userspace requests, use ++ * panthor_vm_bind_exec_sync_op() instead. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_map_bo_range(struct panthor_vm *vm, struct panthor_gem_object *bo, ++ u64 offset, u64 size, u64 va, u32 flags) ++{ ++ struct panthor_vm_op_ctx op_ctx; ++ int ret; ++ ++ ret = panthor_vm_prepare_map_op_ctx(&op_ctx, vm, bo, offset, size, va, flags); ++ if (ret) ++ return ret; ++ ++ ret = panthor_vm_exec_op(vm, &op_ctx, false); ++ panthor_vm_cleanup_op_ctx(&op_ctx, vm); ++ ++ return ret; ++} ++ ++/** ++ * panthor_vm_unmap_range() - Unmap a portion of the VA space ++ * @vm: VM to unmap the region from. ++ * @va: Virtual address to unmap. Must be 4k aligned. ++ * @size: Size of the region to unmap. Must be 4k aligned. ++ * ++ * Internal use only. For userspace requests, use ++ * panthor_vm_bind_exec_sync_op() instead. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_unmap_range(struct panthor_vm *vm, u64 va, u64 size) ++{ ++ struct panthor_vm_op_ctx op_ctx; ++ int ret; ++ ++ ret = panthor_vm_prepare_unmap_op_ctx(&op_ctx, vm, va, size); ++ if (ret) ++ return ret; ++ ++ ret = panthor_vm_exec_op(vm, &op_ctx, false); ++ panthor_vm_cleanup_op_ctx(&op_ctx, vm); ++ ++ return ret; ++} ++ ++/** ++ * panthor_vm_prepare_mapped_bos_resvs() - Prepare resvs on VM BOs. ++ * @exec: Locking/preparation context. ++ * @vm: VM targeted by the GPU job. ++ * @slot_count: Number of slots to reserve. ++ * ++ * GPU jobs assume all BOs bound to the VM at the time the job is submitted ++ * are available when the job is executed. In order to guarantee that, we ++ * need to reserve a slot on all BOs mapped to a VM and update this slot with ++ * the job fence after its submission. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_vm_prepare_mapped_bos_resvs(struct drm_exec *exec, struct panthor_vm *vm, ++ u32 slot_count) ++{ ++ int ret; ++ ++ /* Acquire the VM lock and reserve a slot for this GPU job. */ ++ ret = drm_gpuvm_prepare_vm(&vm->base, exec, slot_count); ++ if (ret) ++ return ret; ++ ++ return drm_gpuvm_prepare_objects(&vm->base, exec, slot_count); ++} ++ ++/** ++ * panthor_mmu_unplug() - Unplug the MMU logic ++ * @ptdev: Device. ++ * ++ * No access to the MMU regs should be done after this function is called. ++ * We suspend the IRQ and disable all VMs to guarantee that. ++ */ ++void panthor_mmu_unplug(struct panthor_device *ptdev) ++{ ++ panthor_mmu_irq_suspend(&ptdev->mmu->irq); ++ ++ mutex_lock(&ptdev->mmu->as.slots_lock); ++ for (u32 i = 0; i < ARRAY_SIZE(ptdev->mmu->as.slots); i++) { ++ struct panthor_vm *vm = ptdev->mmu->as.slots[i].vm; ++ ++ if (vm) { ++ drm_WARN_ON(&ptdev->base, panthor_mmu_as_disable(ptdev, i)); ++ panthor_vm_release_as_locked(vm); ++ } ++ } ++ mutex_unlock(&ptdev->mmu->as.slots_lock); ++} ++ ++static void panthor_mmu_release_wq(struct drm_device *ddev, void *res) ++{ ++ destroy_workqueue(res); ++} ++ ++/** ++ * panthor_mmu_init() - Initialize the MMU logic. ++ * @ptdev: Device. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_mmu_init(struct panthor_device *ptdev) ++{ ++ u32 va_bits = GPU_MMU_FEATURES_VA_BITS(ptdev->gpu_info.mmu_features); ++ struct panthor_mmu *mmu; ++ int ret, irq; ++ ++ mmu = drmm_kzalloc(&ptdev->base, sizeof(*mmu), GFP_KERNEL); ++ if (!mmu) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&mmu->as.lru_list); ++ ++ ret = drmm_mutex_init(&ptdev->base, &mmu->as.slots_lock); ++ if (ret) ++ return ret; ++ ++ INIT_LIST_HEAD(&mmu->vm.list); ++ ret = drmm_mutex_init(&ptdev->base, &mmu->vm.lock); ++ if (ret) ++ return ret; ++ ++ ptdev->mmu = mmu; ++ ++ irq = platform_get_irq_byname(to_platform_device(ptdev->base.dev), "mmu"); ++ if (irq <= 0) ++ return -ENODEV; ++ ++ ret = panthor_request_mmu_irq(ptdev, &mmu->irq, irq, ++ panthor_mmu_fault_mask(ptdev, ~0)); ++ if (ret) ++ return ret; ++ ++ mmu->vm.wq = alloc_workqueue("panthor-vm-bind", WQ_UNBOUND, 0); ++ if (!mmu->vm.wq) ++ return -ENOMEM; ++ ++ /* On 32-bit kernels, the VA space is limited by the io_pgtable_ops abstraction, ++ * which passes iova as an unsigned long. Patch the mmu_features to reflect this ++ * limitation. ++ */ ++ if (sizeof(unsigned long) * 8 < va_bits) { ++ ptdev->gpu_info.mmu_features &= ~GENMASK(7, 0); ++ ptdev->gpu_info.mmu_features |= sizeof(unsigned long) * 8; ++ } ++ ++ return drmm_add_action_or_reset(&ptdev->base, panthor_mmu_release_wq, mmu->vm.wq); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++static int show_vm_gpuvas(struct panthor_vm *vm, struct seq_file *m) ++{ ++ int ret; ++ ++ mutex_lock(&vm->op_lock); ++ ret = drm_debugfs_gpuva_info(m, &vm->base); ++ mutex_unlock(&vm->op_lock); ++ ++ return ret; ++} ++ ++static int show_each_vm(struct seq_file *m, void *arg) ++{ ++ struct drm_info_node *node = (struct drm_info_node *)m->private; ++ struct drm_device *ddev = node->minor->dev; ++ struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base); ++ int (*show)(struct panthor_vm *, struct seq_file *) = node->info_ent->data; ++ struct panthor_vm *vm; ++ int ret = 0; ++ ++ mutex_lock(&ptdev->mmu->vm.lock); ++ list_for_each_entry(vm, &ptdev->mmu->vm.list, node) { ++ ret = show(vm, m); ++ if (ret < 0) ++ break; ++ ++ seq_puts(m, "\n"); ++ } ++ mutex_unlock(&ptdev->mmu->vm.lock); ++ ++ return ret; ++} ++ ++static struct drm_info_list panthor_mmu_debugfs_list[] = { ++ DRM_DEBUGFS_GPUVA_INFO(show_each_vm, show_vm_gpuvas), ++}; ++ ++/** ++ * panthor_mmu_debugfs_init() - Initialize MMU debugfs entries ++ * @minor: Minor. ++ */ ++void panthor_mmu_debugfs_init(struct drm_minor *minor) ++{ ++ drm_debugfs_create_files(panthor_mmu_debugfs_list, ++ ARRAY_SIZE(panthor_mmu_debugfs_list), ++ minor->debugfs_root, minor); ++} ++#endif /* CONFIG_DEBUG_FS */ ++ ++/** ++ * panthor_mmu_pt_cache_init() - Initialize the page table cache. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_mmu_pt_cache_init(void) ++{ ++ pt_cache = kmem_cache_create("panthor-mmu-pt", SZ_4K, SZ_4K, 0, NULL); ++ if (!pt_cache) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++/** ++ * panthor_mmu_pt_cache_fini() - Destroy the page table cache. ++ */ ++void panthor_mmu_pt_cache_fini(void) ++{ ++ kmem_cache_destroy(pt_cache); ++} +diff --git a/drivers/gpu/drm/panthor/panthor_mmu.h b/drivers/gpu/drm/panthor/panthor_mmu.h +new file mode 100644 +index 000000000000..f3c1ed19f973 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_mmu.h +@@ -0,0 +1,102 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2019 Linaro, Ltd, Rob Herring */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifndef __PANTHOR_MMU_H__ ++#define __PANTHOR_MMU_H__ ++ ++#include ++ ++struct drm_exec; ++struct drm_sched_job; ++struct panthor_gem_object; ++struct panthor_heap_pool; ++struct panthor_vm; ++struct panthor_vma; ++struct panthor_mmu; ++ ++int panthor_mmu_init(struct panthor_device *ptdev); ++void panthor_mmu_unplug(struct panthor_device *ptdev); ++void panthor_mmu_pre_reset(struct panthor_device *ptdev); ++void panthor_mmu_post_reset(struct panthor_device *ptdev); ++void panthor_mmu_suspend(struct panthor_device *ptdev); ++void panthor_mmu_resume(struct panthor_device *ptdev); ++ ++int panthor_vm_map_bo_range(struct panthor_vm *vm, struct panthor_gem_object *bo, ++ u64 offset, u64 size, u64 va, u32 flags); ++int panthor_vm_unmap_range(struct panthor_vm *vm, u64 va, u64 size); ++struct panthor_gem_object * ++panthor_vm_get_bo_for_va(struct panthor_vm *vm, u64 va, u64 *bo_offset); ++ ++int panthor_vm_active(struct panthor_vm *vm); ++void panthor_vm_idle(struct panthor_vm *vm); ++int panthor_vm_as(struct panthor_vm *vm); ++ ++struct panthor_heap_pool * ++panthor_vm_get_heap_pool(struct panthor_vm *vm, bool create); ++ ++struct panthor_vm *panthor_vm_get(struct panthor_vm *vm); ++void panthor_vm_put(struct panthor_vm *vm); ++struct panthor_vm *panthor_vm_create(struct panthor_device *ptdev, bool for_mcu, ++ u64 kernel_va_start, u64 kernel_va_size, ++ u64 kernel_auto_va_start, ++ u64 kernel_auto_va_size); ++ ++int panthor_vm_prepare_mapped_bos_resvs(struct drm_exec *exec, ++ struct panthor_vm *vm, ++ u32 slot_count); ++int panthor_vm_add_bos_resvs_deps_to_job(struct panthor_vm *vm, ++ struct drm_sched_job *job); ++void panthor_vm_add_job_fence_to_bos_resvs(struct panthor_vm *vm, ++ struct drm_sched_job *job); ++ ++struct dma_resv *panthor_vm_resv(struct panthor_vm *vm); ++struct drm_gem_object *panthor_vm_root_gem(struct panthor_vm *vm); ++ ++void panthor_vm_pool_destroy(struct panthor_file *pfile); ++int panthor_vm_pool_create(struct panthor_file *pfile); ++int panthor_vm_pool_create_vm(struct panthor_device *ptdev, ++ struct panthor_vm_pool *pool, ++ struct drm_panthor_vm_create *args); ++int panthor_vm_pool_destroy_vm(struct panthor_vm_pool *pool, u32 handle); ++struct panthor_vm *panthor_vm_pool_get_vm(struct panthor_vm_pool *pool, u32 handle); ++ ++bool panthor_vm_has_unhandled_faults(struct panthor_vm *vm); ++bool panthor_vm_is_unusable(struct panthor_vm *vm); ++ ++/* ++ * PANTHOR_VM_KERNEL_AUTO_VA: Use this magic address when you want the GEM ++ * logic to auto-allocate the virtual address in the reserved kernel VA range. ++ */ ++#define PANTHOR_VM_KERNEL_AUTO_VA ~0ull ++ ++int panthor_vm_alloc_va(struct panthor_vm *vm, u64 va, u64 size, ++ struct drm_mm_node *va_node); ++void panthor_vm_free_va(struct panthor_vm *vm, struct drm_mm_node *va_node); ++ ++int panthor_vm_bind_exec_sync_op(struct drm_file *file, ++ struct panthor_vm *vm, ++ struct drm_panthor_vm_bind_op *op); ++ ++struct drm_sched_job * ++panthor_vm_bind_job_create(struct drm_file *file, ++ struct panthor_vm *vm, ++ const struct drm_panthor_vm_bind_op *op); ++void panthor_vm_bind_job_put(struct drm_sched_job *job); ++int panthor_vm_bind_job_prepare_resvs(struct drm_exec *exec, ++ struct drm_sched_job *job); ++void panthor_vm_bind_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *job); ++ ++void panthor_vm_update_resvs(struct panthor_vm *vm, struct drm_exec *exec, ++ struct dma_fence *fence, ++ enum dma_resv_usage private_usage, ++ enum dma_resv_usage extobj_usage); ++ ++int panthor_mmu_pt_cache_init(void); ++void panthor_mmu_pt_cache_fini(void); ++ ++#ifdef CONFIG_DEBUG_FS ++void panthor_mmu_debugfs_init(struct drm_minor *minor); ++#endif ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:22 +0100 +Subject: drm/panthor: Add the FW logical block + +Contains everything that's FW related, that includes the code dealing +with the microcontroller unit (MCU) that's running the FW, and anything +related to allocating memory shared between the FW and the CPU. + +A few global FW events are processed in the IRQ handler, the rest is +forwarded to the scheduler, since scheduling is the primary reason for +the FW existence, and also the main source of FW <-> kernel +interactions. + +v6: +- Add Maxime's and Heiko's acks +- Keep header inclusion alphabetically ordered + +v5: +- Fix typo in GLB_PERFCNT_SAMPLE definition +- Fix unbalanced panthor_vm_idle/active() calls +- Fallback to a slow reset when the fast reset fails +- Add extra information when reporting a FW boot failure + +v4: +- Add a MODULE_FIRMWARE() entry for gen 10.8 +- Fix a wrong return ERR_PTR() in panthor_fw_load_section_entry() +- Fix typos +- Add Steve's R-b + +v3: +- Make the FW path more future-proof (Liviu) +- Use one waitqueue for all FW events +- Simplify propagation of FW events to the scheduler logic +- Drop the panthor_fw_mem abstraction and use panthor_kernel_bo instead +- Account for the panthor_vm changes +- Replace magic number with 0x7fffffff with ~0 to better signify that + it's the maximum permitted value. +- More accurate rounding when computing the firmware timeout. +- Add a 'sub iterator' helper function. This also adds a check that a + firmware entry doesn't overflow the firmware image. +- Drop __packed from FW structures, natural alignment is good enough. +- Other minor code improvements. + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_fw.c | 1362 ++++++++++ + drivers/gpu/drm/panthor/panthor_fw.h | 503 ++++ + 2 files changed, 1865 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_fw.c b/drivers/gpu/drm/panthor/panthor_fw.c +new file mode 100644 +index 000000000000..33c87a59834e +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_fw.c +@@ -0,0 +1,1362 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifdef CONFIG_ARM_ARCH_TIMER ++#include ++#endif ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "panthor_device.h" ++#include "panthor_fw.h" ++#include "panthor_gem.h" ++#include "panthor_gpu.h" ++#include "panthor_mmu.h" ++#include "panthor_regs.h" ++#include "panthor_sched.h" ++ ++#define CSF_FW_NAME "mali_csffw.bin" ++ ++#define PING_INTERVAL_MS 12000 ++#define PROGRESS_TIMEOUT_CYCLES (5ull * 500 * 1024 * 1024) ++#define PROGRESS_TIMEOUT_SCALE_SHIFT 10 ++#define IDLE_HYSTERESIS_US 800 ++#define PWROFF_HYSTERESIS_US 10000 ++ ++/** ++ * struct panthor_fw_binary_hdr - Firmware binary header. ++ */ ++struct panthor_fw_binary_hdr { ++ /** @magic: Magic value to check binary validity. */ ++ u32 magic; ++#define CSF_FW_BINARY_HEADER_MAGIC 0xc3f13a6e ++ ++ /** @minor: Minor FW version. */ ++ u8 minor; ++ ++ /** @major: Major FW version. */ ++ u8 major; ++#define CSF_FW_BINARY_HEADER_MAJOR_MAX 0 ++ ++ /** @padding1: MBZ. */ ++ u16 padding1; ++ ++ /** @version_hash: FW version hash. */ ++ u32 version_hash; ++ ++ /** @padding2: MBZ. */ ++ u32 padding2; ++ ++ /** @size: FW binary size. */ ++ u32 size; ++}; ++ ++/** ++ * enum panthor_fw_binary_entry_type - Firmware binary entry type ++ */ ++enum panthor_fw_binary_entry_type { ++ /** @CSF_FW_BINARY_ENTRY_TYPE_IFACE: Host <-> FW interface. */ ++ CSF_FW_BINARY_ENTRY_TYPE_IFACE = 0, ++ ++ /** @CSF_FW_BINARY_ENTRY_TYPE_CONFIG: FW config. */ ++ CSF_FW_BINARY_ENTRY_TYPE_CONFIG = 1, ++ ++ /** @CSF_FW_BINARY_ENTRY_TYPE_FUTF_TEST: Unit-tests. */ ++ CSF_FW_BINARY_ENTRY_TYPE_FUTF_TEST = 2, ++ ++ /** @CSF_FW_BINARY_ENTRY_TYPE_TRACE_BUFFER: Trace buffer interface. */ ++ CSF_FW_BINARY_ENTRY_TYPE_TRACE_BUFFER = 3, ++ ++ /** @CSF_FW_BINARY_ENTRY_TYPE_TIMELINE_METADATA: Timeline metadata interface. */ ++ CSF_FW_BINARY_ENTRY_TYPE_TIMELINE_METADATA = 4, ++}; ++ ++#define CSF_FW_BINARY_ENTRY_TYPE(ehdr) ((ehdr) & 0xff) ++#define CSF_FW_BINARY_ENTRY_SIZE(ehdr) (((ehdr) >> 8) & 0xff) ++#define CSF_FW_BINARY_ENTRY_UPDATE BIT(30) ++#define CSF_FW_BINARY_ENTRY_OPTIONAL BIT(31) ++ ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_RD BIT(0) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_WR BIT(1) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_EX BIT(2) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_NONE (0 << 3) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED (1 << 3) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_UNCACHED_COHERENT (2 << 3) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED_COHERENT (3 << 3) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK GENMASK(4, 3) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_PROT BIT(5) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED BIT(30) ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO BIT(31) ++ ++#define CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS \ ++ (CSF_FW_BINARY_IFACE_ENTRY_RD_RD | \ ++ CSF_FW_BINARY_IFACE_ENTRY_RD_WR | \ ++ CSF_FW_BINARY_IFACE_ENTRY_RD_EX | \ ++ CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK | \ ++ CSF_FW_BINARY_IFACE_ENTRY_RD_PROT | \ ++ CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED | \ ++ CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO) ++ ++/** ++ * struct panthor_fw_binary_section_entry_hdr - Describes a section of FW binary ++ */ ++struct panthor_fw_binary_section_entry_hdr { ++ /** @flags: Section flags. */ ++ u32 flags; ++ ++ /** @va: MCU virtual range to map this binary section to. */ ++ struct { ++ /** @start: Start address. */ ++ u32 start; ++ ++ /** @end: End address. */ ++ u32 end; ++ } va; ++ ++ /** @data: Data to initialize the FW section with. */ ++ struct { ++ /** @start: Start offset in the FW binary. */ ++ u32 start; ++ ++ /** @end: End offset in the FW binary. */ ++ u32 end; ++ } data; ++}; ++ ++/** ++ * struct panthor_fw_binary_iter - Firmware binary iterator ++ * ++ * Used to parse a firmware binary. ++ */ ++struct panthor_fw_binary_iter { ++ /** @data: FW binary data. */ ++ const void *data; ++ ++ /** @size: FW binary size. */ ++ size_t size; ++ ++ /** @offset: Iterator offset. */ ++ size_t offset; ++}; ++ ++/** ++ * struct panthor_fw_section - FW section ++ */ ++struct panthor_fw_section { ++ /** @node: Used to keep track of FW sections. */ ++ struct list_head node; ++ ++ /** @flags: Section flags, as encoded in the FW binary. */ ++ u32 flags; ++ ++ /** @mem: Section memory. */ ++ struct panthor_kernel_bo *mem; ++ ++ /** ++ * @name: Name of the section, as specified in the binary. ++ * ++ * Can be NULL. ++ */ ++ const char *name; ++ ++ /** ++ * @data: Initial data copied to the FW memory. ++ * ++ * We keep data around so we can reload sections after a reset. ++ */ ++ struct { ++ /** @buf: Buffed used to store init data. */ ++ const void *buf; ++ ++ /** @size: Size of @buf in bytes. */ ++ size_t size; ++ } data; ++}; ++ ++#define CSF_MCU_SHARED_REGION_START 0x04000000ULL ++#define CSF_MCU_SHARED_REGION_SIZE 0x04000000ULL ++ ++#define MIN_CS_PER_CSG 8 ++#define MIN_CSGS 3 ++#define MAX_CSG_PRIO 0xf ++ ++#define CSF_IFACE_VERSION(major, minor, patch) \ ++ (((major) << 24) | ((minor) << 16) | (patch)) ++#define CSF_IFACE_VERSION_MAJOR(v) ((v) >> 24) ++#define CSF_IFACE_VERSION_MINOR(v) (((v) >> 16) & 0xff) ++#define CSF_IFACE_VERSION_PATCH(v) ((v) & 0xffff) ++ ++#define CSF_GROUP_CONTROL_OFFSET 0x1000 ++#define CSF_STREAM_CONTROL_OFFSET 0x40 ++#define CSF_UNPRESERVED_REG_COUNT 4 ++ ++/** ++ * struct panthor_fw_iface - FW interfaces ++ */ ++struct panthor_fw_iface { ++ /** @global: Global interface. */ ++ struct panthor_fw_global_iface global; ++ ++ /** @groups: Group slot interfaces. */ ++ struct panthor_fw_csg_iface groups[MAX_CSGS]; ++ ++ /** @streams: Command stream slot interfaces. */ ++ struct panthor_fw_cs_iface streams[MAX_CSGS][MAX_CS_PER_CSG]; ++}; ++ ++/** ++ * struct panthor_fw - Firmware management ++ */ ++struct panthor_fw { ++ /** @vm: MCU VM. */ ++ struct panthor_vm *vm; ++ ++ /** @sections: List of FW sections. */ ++ struct list_head sections; ++ ++ /** @shared_section: The section containing the FW interfaces. */ ++ struct panthor_fw_section *shared_section; ++ ++ /** @iface: FW interfaces. */ ++ struct panthor_fw_iface iface; ++ ++ /** @watchdog: Collection of fields relating to the FW watchdog. */ ++ struct { ++ /** @ping_work: Delayed work used to ping the FW. */ ++ struct delayed_work ping_work; ++ } watchdog; ++ ++ /** ++ * @req_waitqueue: FW request waitqueue. ++ * ++ * Everytime a request is sent to a command stream group or the global ++ * interface, the caller will first busy wait for the request to be ++ * acknowledged, and then fallback to a sleeping wait. ++ * ++ * This wait queue is here to support the sleeping wait flavor. ++ */ ++ wait_queue_head_t req_waitqueue; ++ ++ /** @booted: True is the FW is booted */ ++ bool booted; ++ ++ /** ++ * @fast_reset: True if the post_reset logic can proceed with a fast reset. ++ * ++ * A fast reset is just a reset where the driver doesn't reload the FW sections. ++ * ++ * Any time the firmware is properly suspended, a fast reset can take place. ++ * On the other hand, if the halt operation failed, the driver will reload ++ * all sections to make sure we start from a fresh state. ++ */ ++ bool fast_reset; ++ ++ /** @irq: Job irq data. */ ++ struct panthor_irq irq; ++}; ++ ++struct panthor_vm *panthor_fw_vm(struct panthor_device *ptdev) ++{ ++ return ptdev->fw->vm; ++} ++ ++/** ++ * panthor_fw_get_glb_iface() - Get the global interface ++ * @ptdev: Device. ++ * ++ * Return: The global interface. ++ */ ++struct panthor_fw_global_iface * ++panthor_fw_get_glb_iface(struct panthor_device *ptdev) ++{ ++ return &ptdev->fw->iface.global; ++} ++ ++/** ++ * panthor_fw_get_csg_iface() - Get a command stream group slot interface ++ * @ptdev: Device. ++ * @csg_slot: Index of the command stream group slot. ++ * ++ * Return: The command stream group slot interface. ++ */ ++struct panthor_fw_csg_iface * ++panthor_fw_get_csg_iface(struct panthor_device *ptdev, u32 csg_slot) ++{ ++ if (drm_WARN_ON(&ptdev->base, csg_slot >= MAX_CSGS)) ++ return NULL; ++ ++ return &ptdev->fw->iface.groups[csg_slot]; ++} ++ ++/** ++ * panthor_fw_get_cs_iface() - Get a command stream slot interface ++ * @ptdev: Device. ++ * @csg_slot: Index of the command stream group slot. ++ * @cs_slot: Index of the command stream slot. ++ * ++ * Return: The command stream slot interface. ++ */ ++struct panthor_fw_cs_iface * ++panthor_fw_get_cs_iface(struct panthor_device *ptdev, u32 csg_slot, u32 cs_slot) ++{ ++ if (drm_WARN_ON(&ptdev->base, csg_slot >= MAX_CSGS || cs_slot > MAX_CS_PER_CSG)) ++ return NULL; ++ ++ return &ptdev->fw->iface.streams[csg_slot][cs_slot]; ++} ++ ++/** ++ * panthor_fw_conv_timeout() - Convert a timeout into a cycle-count ++ * @ptdev: Device. ++ * @timeout_us: Timeout expressed in micro-seconds. ++ * ++ * The FW has two timer sources: the GPU counter or arch-timer. We need ++ * to express timeouts in term of number of cycles and specify which ++ * timer source should be used. ++ * ++ * Return: A value suitable for timeout fields in the global interface. ++ */ ++static u32 panthor_fw_conv_timeout(struct panthor_device *ptdev, u32 timeout_us) ++{ ++ bool use_cycle_counter = false; ++ u32 timer_rate = 0; ++ u64 mod_cycles; ++ ++#ifdef CONFIG_ARM_ARCH_TIMER ++ timer_rate = arch_timer_get_cntfrq(); ++#endif ++ ++ if (!timer_rate) { ++ use_cycle_counter = true; ++ timer_rate = clk_get_rate(ptdev->clks.core); ++ } ++ ++ if (drm_WARN_ON(&ptdev->base, !timer_rate)) { ++ /* We couldn't get a valid clock rate, let's just pick the ++ * maximum value so the FW still handles the core ++ * power on/off requests. ++ */ ++ return GLB_TIMER_VAL(~0) | ++ GLB_TIMER_SOURCE_GPU_COUNTER; ++ } ++ ++ mod_cycles = DIV_ROUND_UP_ULL((u64)timeout_us * timer_rate, ++ 1000000ull << 10); ++ if (drm_WARN_ON(&ptdev->base, mod_cycles > GLB_TIMER_VAL(~0))) ++ mod_cycles = GLB_TIMER_VAL(~0); ++ ++ return GLB_TIMER_VAL(mod_cycles) | ++ (use_cycle_counter ? GLB_TIMER_SOURCE_GPU_COUNTER : 0); ++} ++ ++static int panthor_fw_binary_iter_read(struct panthor_device *ptdev, ++ struct panthor_fw_binary_iter *iter, ++ void *out, size_t size) ++{ ++ size_t new_offset = iter->offset + size; ++ ++ if (new_offset > iter->size || new_offset < iter->offset) { ++ drm_err(&ptdev->base, "Firmware too small\n"); ++ return -EINVAL; ++ } ++ ++ memcpy(out, iter->data + iter->offset, size); ++ iter->offset = new_offset; ++ return 0; ++} ++ ++static int panthor_fw_binary_sub_iter_init(struct panthor_device *ptdev, ++ struct panthor_fw_binary_iter *iter, ++ struct panthor_fw_binary_iter *sub_iter, ++ size_t size) ++{ ++ size_t new_offset = iter->offset + size; ++ ++ if (new_offset > iter->size || new_offset < iter->offset) { ++ drm_err(&ptdev->base, "Firmware entry too long\n"); ++ return -EINVAL; ++ } ++ ++ sub_iter->offset = 0; ++ sub_iter->data = iter->data + iter->offset; ++ sub_iter->size = size; ++ iter->offset = new_offset; ++ return 0; ++} ++ ++static void panthor_fw_init_section_mem(struct panthor_device *ptdev, ++ struct panthor_fw_section *section) ++{ ++ bool was_mapped = !!section->mem->kmap; ++ int ret; ++ ++ if (!section->data.size && ++ !(section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO)) ++ return; ++ ++ ret = panthor_kernel_bo_vmap(section->mem); ++ if (drm_WARN_ON(&ptdev->base, ret)) ++ return; ++ ++ memcpy(section->mem->kmap, section->data.buf, section->data.size); ++ if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_ZERO) { ++ memset(section->mem->kmap + section->data.size, 0, ++ panthor_kernel_bo_size(section->mem) - section->data.size); ++ } ++ ++ if (!was_mapped) ++ panthor_kernel_bo_vunmap(section->mem); ++} ++ ++/** ++ * panthor_fw_alloc_queue_iface_mem() - Allocate a ring-buffer interfaces. ++ * @ptdev: Device. ++ * @input: Pointer holding the input interface on success. ++ * Should be ignored on failure. ++ * @output: Pointer holding the output interface on success. ++ * Should be ignored on failure. ++ * @input_fw_va: Pointer holding the input interface FW VA on success. ++ * Should be ignored on failure. ++ * @output_fw_va: Pointer holding the output interface FW VA on success. ++ * Should be ignored on failure. ++ * ++ * Allocates panthor_fw_ringbuf_{input,out}_iface interfaces. The input ++ * interface is at offset 0, and the output interface at offset 4096. ++ * ++ * Return: A valid pointer in case of success, an ERR_PTR() otherwise. ++ */ ++struct panthor_kernel_bo * ++panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev, ++ struct panthor_fw_ringbuf_input_iface **input, ++ const struct panthor_fw_ringbuf_output_iface **output, ++ u32 *input_fw_va, u32 *output_fw_va) ++{ ++ struct panthor_kernel_bo *mem; ++ int ret; ++ ++ mem = panthor_kernel_bo_create(ptdev, ptdev->fw->vm, SZ_8K, ++ DRM_PANTHOR_BO_NO_MMAP, ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | ++ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, ++ PANTHOR_VM_KERNEL_AUTO_VA); ++ if (IS_ERR(mem)) ++ return mem; ++ ++ ret = panthor_kernel_bo_vmap(mem); ++ if (ret) { ++ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), mem); ++ return ERR_PTR(ret); ++ } ++ ++ memset(mem->kmap, 0, panthor_kernel_bo_size(mem)); ++ *input = mem->kmap; ++ *output = mem->kmap + SZ_4K; ++ *input_fw_va = panthor_kernel_bo_gpuva(mem); ++ *output_fw_va = *input_fw_va + SZ_4K; ++ ++ return mem; ++} ++ ++/** ++ * panthor_fw_alloc_suspend_buf_mem() - Allocate a suspend buffer for a command stream group. ++ * @ptdev: Device. ++ * @size: Size of the suspend buffer. ++ * ++ * Return: A valid pointer in case of success, an ERR_PTR() otherwise. ++ */ ++struct panthor_kernel_bo * ++panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size) ++{ ++ return panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev), size, ++ DRM_PANTHOR_BO_NO_MMAP, ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC, ++ PANTHOR_VM_KERNEL_AUTO_VA); ++} ++ ++static int panthor_fw_load_section_entry(struct panthor_device *ptdev, ++ const struct firmware *fw, ++ struct panthor_fw_binary_iter *iter, ++ u32 ehdr) ++{ ++ struct panthor_fw_binary_section_entry_hdr hdr; ++ struct panthor_fw_section *section; ++ u32 section_size; ++ u32 name_len; ++ int ret; ++ ++ ret = panthor_fw_binary_iter_read(ptdev, iter, &hdr, sizeof(hdr)); ++ if (ret) ++ return ret; ++ ++ if (hdr.data.end < hdr.data.start) { ++ drm_err(&ptdev->base, "Firmware corrupted, data.end < data.start (0x%x < 0x%x)\n", ++ hdr.data.end, hdr.data.start); ++ return -EINVAL; ++ } ++ ++ if (hdr.va.end < hdr.va.start) { ++ drm_err(&ptdev->base, "Firmware corrupted, hdr.va.end < hdr.va.start (0x%x < 0x%x)\n", ++ hdr.va.end, hdr.va.start); ++ return -EINVAL; ++ } ++ ++ if (hdr.data.end > fw->size) { ++ drm_err(&ptdev->base, "Firmware corrupted, file truncated? data_end=0x%x > fw size=0x%zx\n", ++ hdr.data.end, fw->size); ++ return -EINVAL; ++ } ++ ++ if ((hdr.va.start & ~PAGE_MASK) != 0 || ++ (hdr.va.end & ~PAGE_MASK) != 0) { ++ drm_err(&ptdev->base, "Firmware corrupted, virtual addresses not page aligned: 0x%x-0x%x\n", ++ hdr.va.start, hdr.va.end); ++ return -EINVAL; ++ } ++ ++ if (hdr.flags & ~CSF_FW_BINARY_IFACE_ENTRY_RD_SUPPORTED_FLAGS) { ++ drm_err(&ptdev->base, "Firmware contains interface with unsupported flags (0x%x)\n", ++ hdr.flags); ++ return -EINVAL; ++ } ++ ++ if (hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_PROT) { ++ drm_warn(&ptdev->base, ++ "Firmware protected mode entry not be supported, ignoring"); ++ return 0; ++ } ++ ++ if (hdr.va.start == CSF_MCU_SHARED_REGION_START && ++ !(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED)) { ++ drm_err(&ptdev->base, ++ "Interface at 0x%llx must be shared", CSF_MCU_SHARED_REGION_START); ++ return -EINVAL; ++ } ++ ++ name_len = iter->size - iter->offset; ++ ++ section = drmm_kzalloc(&ptdev->base, sizeof(*section), GFP_KERNEL); ++ if (!section) ++ return -ENOMEM; ++ ++ list_add_tail(§ion->node, &ptdev->fw->sections); ++ section->flags = hdr.flags; ++ section->data.size = hdr.data.end - hdr.data.start; ++ ++ if (section->data.size > 0) { ++ void *data = drmm_kmalloc(&ptdev->base, section->data.size, GFP_KERNEL); ++ ++ if (!data) ++ return -ENOMEM; ++ ++ memcpy(data, fw->data + hdr.data.start, section->data.size); ++ section->data.buf = data; ++ } ++ ++ if (name_len > 0) { ++ char *name = drmm_kmalloc(&ptdev->base, name_len + 1, GFP_KERNEL); ++ ++ if (!name) ++ return -ENOMEM; ++ ++ memcpy(name, iter->data + iter->offset, name_len); ++ name[name_len] = '\0'; ++ section->name = name; ++ } ++ ++ section_size = hdr.va.end - hdr.va.start; ++ if (section_size) { ++ u32 cache_mode = hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_MASK; ++ struct panthor_gem_object *bo; ++ u32 vm_map_flags = 0; ++ struct sg_table *sgt; ++ u64 va = hdr.va.start; ++ ++ if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR)) ++ vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_READONLY; ++ ++ if (!(hdr.flags & CSF_FW_BINARY_IFACE_ENTRY_RD_EX)) ++ vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC; ++ ++ /* TODO: CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_*_COHERENT are mapped to ++ * non-cacheable for now. We might want to introduce a new ++ * IOMMU_xxx flag (or abuse IOMMU_MMIO, which maps to device ++ * memory and is currently not used by our driver) for ++ * AS_MEMATTR_AARCH64_SHARED memory, so we can take benefit ++ * of IO-coherent systems. ++ */ ++ if (cache_mode != CSF_FW_BINARY_IFACE_ENTRY_RD_CACHE_MODE_CACHED) ++ vm_map_flags |= DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED; ++ ++ section->mem = panthor_kernel_bo_create(ptdev, panthor_fw_vm(ptdev), ++ section_size, ++ DRM_PANTHOR_BO_NO_MMAP, ++ vm_map_flags, va); ++ if (IS_ERR(section->mem)) ++ return PTR_ERR(section->mem); ++ ++ if (drm_WARN_ON(&ptdev->base, section->mem->va_node.start != hdr.va.start)) ++ return -EINVAL; ++ ++ if (section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_SHARED) { ++ ret = panthor_kernel_bo_vmap(section->mem); ++ if (ret) ++ return ret; ++ } ++ ++ panthor_fw_init_section_mem(ptdev, section); ++ ++ bo = to_panthor_bo(section->mem->obj); ++ sgt = drm_gem_shmem_get_pages_sgt(&bo->base); ++ if (IS_ERR(sgt)) ++ return PTR_ERR(sgt); ++ ++ dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE); ++ } ++ ++ if (hdr.va.start == CSF_MCU_SHARED_REGION_START) ++ ptdev->fw->shared_section = section; ++ ++ return 0; ++} ++ ++static void ++panthor_reload_fw_sections(struct panthor_device *ptdev, bool full_reload) ++{ ++ struct panthor_fw_section *section; ++ ++ list_for_each_entry(section, &ptdev->fw->sections, node) { ++ struct sg_table *sgt; ++ ++ if (!full_reload && !(section->flags & CSF_FW_BINARY_IFACE_ENTRY_RD_WR)) ++ continue; ++ ++ panthor_fw_init_section_mem(ptdev, section); ++ sgt = drm_gem_shmem_get_pages_sgt(&to_panthor_bo(section->mem->obj)->base); ++ if (!drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(sgt))) ++ dma_sync_sgtable_for_device(ptdev->base.dev, sgt, DMA_TO_DEVICE); ++ } ++} ++ ++static int panthor_fw_load_entry(struct panthor_device *ptdev, ++ const struct firmware *fw, ++ struct panthor_fw_binary_iter *iter) ++{ ++ struct panthor_fw_binary_iter eiter; ++ u32 ehdr; ++ int ret; ++ ++ ret = panthor_fw_binary_iter_read(ptdev, iter, &ehdr, sizeof(ehdr)); ++ if (ret) ++ return ret; ++ ++ if ((iter->offset % sizeof(u32)) || ++ (CSF_FW_BINARY_ENTRY_SIZE(ehdr) % sizeof(u32))) { ++ drm_err(&ptdev->base, "Firmware entry isn't 32 bit aligned, offset=0x%x size=0x%x\n", ++ (u32)(iter->offset - sizeof(u32)), CSF_FW_BINARY_ENTRY_SIZE(ehdr)); ++ return -EINVAL; ++ } ++ ++ if (panthor_fw_binary_sub_iter_init(ptdev, iter, &eiter, ++ CSF_FW_BINARY_ENTRY_SIZE(ehdr) - sizeof(ehdr))) ++ return -EINVAL; ++ ++ switch (CSF_FW_BINARY_ENTRY_TYPE(ehdr)) { ++ case CSF_FW_BINARY_ENTRY_TYPE_IFACE: ++ return panthor_fw_load_section_entry(ptdev, fw, &eiter, ehdr); ++ ++ /* FIXME: handle those entry types? */ ++ case CSF_FW_BINARY_ENTRY_TYPE_CONFIG: ++ case CSF_FW_BINARY_ENTRY_TYPE_FUTF_TEST: ++ case CSF_FW_BINARY_ENTRY_TYPE_TRACE_BUFFER: ++ case CSF_FW_BINARY_ENTRY_TYPE_TIMELINE_METADATA: ++ return 0; ++ default: ++ break; ++ } ++ ++ if (ehdr & CSF_FW_BINARY_ENTRY_OPTIONAL) ++ return 0; ++ ++ drm_err(&ptdev->base, ++ "Unsupported non-optional entry type %u in firmware\n", ++ CSF_FW_BINARY_ENTRY_TYPE(ehdr)); ++ return -EINVAL; ++} ++ ++static int panthor_fw_load(struct panthor_device *ptdev) ++{ ++ const struct firmware *fw = NULL; ++ struct panthor_fw_binary_iter iter = {}; ++ struct panthor_fw_binary_hdr hdr; ++ char fw_path[128]; ++ int ret; ++ ++ snprintf(fw_path, sizeof(fw_path), "arm/mali/arch%d.%d/%s", ++ (u32)GPU_ARCH_MAJOR(ptdev->gpu_info.gpu_id), ++ (u32)GPU_ARCH_MINOR(ptdev->gpu_info.gpu_id), ++ CSF_FW_NAME); ++ ++ ret = request_firmware(&fw, fw_path, ptdev->base.dev); ++ if (ret) { ++ drm_err(&ptdev->base, "Failed to load firmware image '%s'\n", ++ CSF_FW_NAME); ++ return ret; ++ } ++ ++ iter.data = fw->data; ++ iter.size = fw->size; ++ ret = panthor_fw_binary_iter_read(ptdev, &iter, &hdr, sizeof(hdr)); ++ if (ret) ++ goto out; ++ ++ if (hdr.magic != CSF_FW_BINARY_HEADER_MAGIC) { ++ ret = -EINVAL; ++ drm_err(&ptdev->base, "Invalid firmware magic\n"); ++ goto out; ++ } ++ ++ if (hdr.major != CSF_FW_BINARY_HEADER_MAJOR_MAX) { ++ ret = -EINVAL; ++ drm_err(&ptdev->base, "Unsupported firmware binary header version %d.%d (expected %d.x)\n", ++ hdr.major, hdr.minor, CSF_FW_BINARY_HEADER_MAJOR_MAX); ++ goto out; ++ } ++ ++ if (hdr.size > iter.size) { ++ drm_err(&ptdev->base, "Firmware image is truncated\n"); ++ goto out; ++ } ++ ++ iter.size = hdr.size; ++ ++ while (iter.offset < hdr.size) { ++ ret = panthor_fw_load_entry(ptdev, fw, &iter); ++ if (ret) ++ goto out; ++ } ++ ++ if (!ptdev->fw->shared_section) { ++ drm_err(&ptdev->base, "Shared interface region not found\n"); ++ ret = -EINVAL; ++ goto out; ++ } ++ ++out: ++ release_firmware(fw); ++ return ret; ++} ++ ++/** ++ * iface_fw_to_cpu_addr() - Turn an MCU address into a CPU address ++ * @ptdev: Device. ++ * @mcu_va: MCU address. ++ * ++ * Return: NULL if the address is not part of the shared section, non-NULL otherwise. ++ */ ++static void *iface_fw_to_cpu_addr(struct panthor_device *ptdev, u32 mcu_va) ++{ ++ u64 shared_mem_start = panthor_kernel_bo_gpuva(ptdev->fw->shared_section->mem); ++ u64 shared_mem_end = shared_mem_start + ++ panthor_kernel_bo_size(ptdev->fw->shared_section->mem); ++ if (mcu_va < shared_mem_start || mcu_va >= shared_mem_end) ++ return NULL; ++ ++ return ptdev->fw->shared_section->mem->kmap + (mcu_va - shared_mem_start); ++} ++ ++static int panthor_init_cs_iface(struct panthor_device *ptdev, ++ unsigned int csg_idx, unsigned int cs_idx) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, csg_idx); ++ struct panthor_fw_cs_iface *cs_iface = &ptdev->fw->iface.streams[csg_idx][cs_idx]; ++ u64 shared_section_sz = panthor_kernel_bo_size(ptdev->fw->shared_section->mem); ++ u32 iface_offset = CSF_GROUP_CONTROL_OFFSET + ++ (csg_idx * glb_iface->control->group_stride) + ++ CSF_STREAM_CONTROL_OFFSET + ++ (cs_idx * csg_iface->control->stream_stride); ++ struct panthor_fw_cs_iface *first_cs_iface = ++ panthor_fw_get_cs_iface(ptdev, 0, 0); ++ ++ if (iface_offset + sizeof(*cs_iface) >= shared_section_sz) ++ return -EINVAL; ++ ++ spin_lock_init(&cs_iface->lock); ++ cs_iface->control = ptdev->fw->shared_section->mem->kmap + iface_offset; ++ cs_iface->input = iface_fw_to_cpu_addr(ptdev, cs_iface->control->input_va); ++ cs_iface->output = iface_fw_to_cpu_addr(ptdev, cs_iface->control->output_va); ++ ++ if (!cs_iface->input || !cs_iface->output) { ++ drm_err(&ptdev->base, "Invalid stream control interface input/output VA"); ++ return -EINVAL; ++ } ++ ++ if (cs_iface != first_cs_iface) { ++ if (cs_iface->control->features != first_cs_iface->control->features) { ++ drm_err(&ptdev->base, "Expecting identical CS slots"); ++ return -EINVAL; ++ } ++ } else { ++ u32 reg_count = CS_FEATURES_WORK_REGS(cs_iface->control->features); ++ ++ ptdev->csif_info.cs_reg_count = reg_count; ++ ptdev->csif_info.unpreserved_cs_reg_count = CSF_UNPRESERVED_REG_COUNT; ++ } ++ ++ return 0; ++} ++ ++static bool compare_csg(const struct panthor_fw_csg_control_iface *a, ++ const struct panthor_fw_csg_control_iface *b) ++{ ++ if (a->features != b->features) ++ return false; ++ if (a->suspend_size != b->suspend_size) ++ return false; ++ if (a->protm_suspend_size != b->protm_suspend_size) ++ return false; ++ if (a->stream_num != b->stream_num) ++ return false; ++ return true; ++} ++ ++static int panthor_init_csg_iface(struct panthor_device *ptdev, ++ unsigned int csg_idx) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ struct panthor_fw_csg_iface *csg_iface = &ptdev->fw->iface.groups[csg_idx]; ++ u64 shared_section_sz = panthor_kernel_bo_size(ptdev->fw->shared_section->mem); ++ u32 iface_offset = CSF_GROUP_CONTROL_OFFSET + (csg_idx * glb_iface->control->group_stride); ++ unsigned int i; ++ ++ if (iface_offset + sizeof(*csg_iface) >= shared_section_sz) ++ return -EINVAL; ++ ++ spin_lock_init(&csg_iface->lock); ++ csg_iface->control = ptdev->fw->shared_section->mem->kmap + iface_offset; ++ csg_iface->input = iface_fw_to_cpu_addr(ptdev, csg_iface->control->input_va); ++ csg_iface->output = iface_fw_to_cpu_addr(ptdev, csg_iface->control->output_va); ++ ++ if (csg_iface->control->stream_num < MIN_CS_PER_CSG || ++ csg_iface->control->stream_num > MAX_CS_PER_CSG) ++ return -EINVAL; ++ ++ if (!csg_iface->input || !csg_iface->output) { ++ drm_err(&ptdev->base, "Invalid group control interface input/output VA"); ++ return -EINVAL; ++ } ++ ++ if (csg_idx > 0) { ++ struct panthor_fw_csg_iface *first_csg_iface = ++ panthor_fw_get_csg_iface(ptdev, 0); ++ ++ if (!compare_csg(first_csg_iface->control, csg_iface->control)) { ++ drm_err(&ptdev->base, "Expecting identical CSG slots"); ++ return -EINVAL; ++ } ++ } ++ ++ for (i = 0; i < csg_iface->control->stream_num; i++) { ++ int ret = panthor_init_cs_iface(ptdev, csg_idx, i); ++ ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static u32 panthor_get_instr_features(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ ++ if (glb_iface->control->version < CSF_IFACE_VERSION(1, 1, 0)) ++ return 0; ++ ++ return glb_iface->control->instr_features; ++} ++ ++static int panthor_fw_init_ifaces(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_global_iface *glb_iface = &ptdev->fw->iface.global; ++ unsigned int i; ++ ++ if (!ptdev->fw->shared_section->mem->kmap) ++ return -EINVAL; ++ ++ spin_lock_init(&glb_iface->lock); ++ glb_iface->control = ptdev->fw->shared_section->mem->kmap; ++ ++ if (!glb_iface->control->version) { ++ drm_err(&ptdev->base, "Firmware version is 0. Firmware may have failed to boot"); ++ return -EINVAL; ++ } ++ ++ glb_iface->input = iface_fw_to_cpu_addr(ptdev, glb_iface->control->input_va); ++ glb_iface->output = iface_fw_to_cpu_addr(ptdev, glb_iface->control->output_va); ++ if (!glb_iface->input || !glb_iface->output) { ++ drm_err(&ptdev->base, "Invalid global control interface input/output VA"); ++ return -EINVAL; ++ } ++ ++ if (glb_iface->control->group_num > MAX_CSGS || ++ glb_iface->control->group_num < MIN_CSGS) { ++ drm_err(&ptdev->base, "Invalid number of control groups"); ++ return -EINVAL; ++ } ++ ++ for (i = 0; i < glb_iface->control->group_num; i++) { ++ int ret = panthor_init_csg_iface(ptdev, i); ++ ++ if (ret) ++ return ret; ++ } ++ ++ drm_info(&ptdev->base, "CSF FW v%d.%d.%d, Features %#x Instrumentation features %#x", ++ CSF_IFACE_VERSION_MAJOR(glb_iface->control->version), ++ CSF_IFACE_VERSION_MINOR(glb_iface->control->version), ++ CSF_IFACE_VERSION_PATCH(glb_iface->control->version), ++ glb_iface->control->features, ++ panthor_get_instr_features(ptdev)); ++ return 0; ++} ++ ++static void panthor_fw_init_global_iface(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ ++ /* Enable all cores. */ ++ glb_iface->input->core_en_mask = ptdev->gpu_info.shader_present; ++ ++ /* Setup timers. */ ++ glb_iface->input->poweroff_timer = panthor_fw_conv_timeout(ptdev, PWROFF_HYSTERESIS_US); ++ glb_iface->input->progress_timer = PROGRESS_TIMEOUT_CYCLES >> PROGRESS_TIMEOUT_SCALE_SHIFT; ++ glb_iface->input->idle_timer = panthor_fw_conv_timeout(ptdev, IDLE_HYSTERESIS_US); ++ ++ /* Enable interrupts we care about. */ ++ glb_iface->input->ack_irq_mask = GLB_CFG_ALLOC_EN | ++ GLB_PING | ++ GLB_CFG_PROGRESS_TIMER | ++ GLB_CFG_POWEROFF_TIMER | ++ GLB_IDLE_EN | ++ GLB_IDLE; ++ ++ panthor_fw_update_reqs(glb_iface, req, GLB_IDLE_EN, GLB_IDLE_EN); ++ panthor_fw_toggle_reqs(glb_iface, req, ack, ++ GLB_CFG_ALLOC_EN | ++ GLB_CFG_POWEROFF_TIMER | ++ GLB_CFG_PROGRESS_TIMER); ++ ++ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1); ++ ++ /* Kick the watchdog. */ ++ mod_delayed_work(ptdev->reset.wq, &ptdev->fw->watchdog.ping_work, ++ msecs_to_jiffies(PING_INTERVAL_MS)); ++} ++ ++static void panthor_job_irq_handler(struct panthor_device *ptdev, u32 status) ++{ ++ if (!ptdev->fw->booted && (status & JOB_INT_GLOBAL_IF)) ++ ptdev->fw->booted = true; ++ ++ wake_up_all(&ptdev->fw->req_waitqueue); ++ ++ /* If the FW is not booted, don't process IRQs, just flag the FW as booted. */ ++ if (!ptdev->fw->booted) ++ return; ++ ++ panthor_sched_report_fw_events(ptdev, status); ++} ++PANTHOR_IRQ_HANDLER(job, JOB, panthor_job_irq_handler); ++ ++static int panthor_fw_start(struct panthor_device *ptdev) ++{ ++ bool timedout = false; ++ ++ ptdev->fw->booted = false; ++ panthor_job_irq_resume(&ptdev->fw->irq, ~0); ++ gpu_write(ptdev, MCU_CONTROL, MCU_CONTROL_AUTO); ++ ++ if (!wait_event_timeout(ptdev->fw->req_waitqueue, ++ ptdev->fw->booted, ++ msecs_to_jiffies(1000))) { ++ if (!ptdev->fw->booted && ++ !(gpu_read(ptdev, JOB_INT_STAT) & JOB_INT_GLOBAL_IF)) ++ timedout = true; ++ } ++ ++ if (timedout) { ++ static const char * const status_str[] = { ++ [MCU_STATUS_DISABLED] = "disabled", ++ [MCU_STATUS_ENABLED] = "enabled", ++ [MCU_STATUS_HALT] = "halt", ++ [MCU_STATUS_FATAL] = "fatal", ++ }; ++ u32 status = gpu_read(ptdev, MCU_STATUS); ++ ++ drm_err(&ptdev->base, "Failed to boot MCU (status=%s)", ++ status < ARRAY_SIZE(status_str) ? status_str[status] : "unknown"); ++ return -ETIMEDOUT; ++ } ++ ++ return 0; ++} ++ ++static void panthor_fw_stop(struct panthor_device *ptdev) ++{ ++ u32 status; ++ ++ gpu_write(ptdev, MCU_CONTROL, MCU_CONTROL_DISABLE); ++ if (readl_poll_timeout(ptdev->iomem + MCU_STATUS, status, ++ status == MCU_STATUS_DISABLED, 10, 100000)) ++ drm_err(&ptdev->base, "Failed to stop MCU"); ++} ++ ++/** ++ * panthor_fw_pre_reset() - Call before a reset. ++ * @ptdev: Device. ++ * @on_hang: true if the reset was triggered on a GPU hang. ++ * ++ * If the reset is not triggered on a hang, we try to gracefully halt the ++ * MCU, so we can do a fast-reset when panthor_fw_post_reset() is called. ++ */ ++void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang) ++{ ++ /* Make sure we won't be woken up by a ping. */ ++ cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work); ++ ++ ptdev->fw->fast_reset = false; ++ ++ if (!on_hang) { ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ u32 status; ++ ++ panthor_fw_update_reqs(glb_iface, req, GLB_HALT, GLB_HALT); ++ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1); ++ if (!readl_poll_timeout(ptdev->iomem + MCU_STATUS, status, ++ status == MCU_STATUS_HALT, 10, 100000) && ++ glb_iface->output->halt_status == PANTHOR_FW_HALT_OK) { ++ ptdev->fw->fast_reset = true; ++ } else { ++ drm_warn(&ptdev->base, "Failed to cleanly suspend MCU"); ++ } ++ ++ /* The FW detects 0 -> 1 transitions. Make sure we reset ++ * the HALT bit before the FW is rebooted. ++ */ ++ panthor_fw_update_reqs(glb_iface, req, 0, GLB_HALT); ++ } ++ ++ panthor_job_irq_suspend(&ptdev->fw->irq); ++} ++ ++/** ++ * panthor_fw_post_reset() - Call after a reset. ++ * @ptdev: Device. ++ * ++ * Start the FW. If this is not a fast reset, all FW sections are reloaded to ++ * make sure we can recover from a memory corruption. ++ */ ++int panthor_fw_post_reset(struct panthor_device *ptdev) ++{ ++ int ret; ++ ++ /* Make the MCU VM active. */ ++ ret = panthor_vm_active(ptdev->fw->vm); ++ if (ret) ++ return ret; ++ ++ /* If this is a fast reset, try to start the MCU without reloading ++ * the FW sections. If it fails, go for a full reset. ++ */ ++ if (ptdev->fw->fast_reset) { ++ ret = panthor_fw_start(ptdev); ++ if (!ret) ++ goto out; ++ ++ /* Force a disable, so we get a fresh boot on the next ++ * panthor_fw_start() call. ++ */ ++ gpu_write(ptdev, MCU_CONTROL, MCU_CONTROL_DISABLE); ++ drm_err(&ptdev->base, "FW fast reset failed, trying a slow reset"); ++ } ++ ++ /* Reload all sections, including RO ones. We're not supposed ++ * to end up here anyway, let's just assume the overhead of ++ * reloading everything is acceptable. ++ */ ++ panthor_reload_fw_sections(ptdev, true); ++ ++ ret = panthor_fw_start(ptdev); ++ if (ret) { ++ drm_err(&ptdev->base, "FW slow reset failed"); ++ return ret; ++ } ++ ++out: ++ /* We must re-initialize the global interface even on fast-reset. */ ++ panthor_fw_init_global_iface(ptdev); ++ return 0; ++} ++ ++/** ++ * panthor_fw_unplug() - Called when the device is unplugged. ++ * @ptdev: Device. ++ * ++ * This function must make sure all pending operations are flushed before ++ * will release device resources, thus preventing any interaction with ++ * the HW. ++ * ++ * If there is still FW-related work running after this function returns, ++ * they must use drm_dev_{enter,exit}() and skip any HW access when ++ * drm_dev_enter() returns false. ++ */ ++void panthor_fw_unplug(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_section *section; ++ ++ cancel_delayed_work_sync(&ptdev->fw->watchdog.ping_work); ++ ++ /* Make sure the IRQ handler can be called after that point. */ ++ if (ptdev->fw->irq.irq) ++ panthor_job_irq_suspend(&ptdev->fw->irq); ++ ++ panthor_fw_stop(ptdev); ++ ++ list_for_each_entry(section, &ptdev->fw->sections, node) ++ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), section->mem); ++ ++ /* We intentionally don't call panthor_vm_idle() and let ++ * panthor_mmu_unplug() release the AS we acquired with ++ * panthor_vm_active() so we don't have to track the VM active/idle ++ * state to keep the active_refcnt balanced. ++ */ ++ panthor_vm_put(ptdev->fw->vm); ++ ++ panthor_gpu_power_off(ptdev, L2, ptdev->gpu_info.l2_present, 20000); ++} ++ ++/** ++ * panthor_fw_wait_acks() - Wait for requests to be acknowledged by the FW. ++ * @req_ptr: Pointer to the req register. ++ * @ack_ptr: Pointer to the ack register. ++ * @wq: Wait queue to use for the sleeping wait. ++ * @req_mask: Mask of requests to wait for. ++ * @acked: Pointer to field that's updated with the acked requests. ++ * If the function returns 0, *acked == req_mask. ++ * @timeout_ms: Timeout expressed in milliseconds. ++ * ++ * Return: 0 on success, -ETIMEDOUT otherwise. ++ */ ++static int panthor_fw_wait_acks(const u32 *req_ptr, const u32 *ack_ptr, ++ wait_queue_head_t *wq, ++ u32 req_mask, u32 *acked, ++ u32 timeout_ms) ++{ ++ u32 ack, req = READ_ONCE(*req_ptr) & req_mask; ++ int ret; ++ ++ /* Busy wait for a few µsecs before falling back to a sleeping wait. */ ++ *acked = req_mask; ++ ret = read_poll_timeout_atomic(READ_ONCE, ack, ++ (ack & req_mask) == req, ++ 0, 10, 0, ++ *ack_ptr); ++ if (!ret) ++ return 0; ++ ++ if (wait_event_timeout(*wq, (READ_ONCE(*ack_ptr) & req_mask) == req, ++ msecs_to_jiffies(timeout_ms))) ++ return 0; ++ ++ /* Check one last time, in case we were not woken up for some reason. */ ++ ack = READ_ONCE(*ack_ptr); ++ if ((ack & req_mask) == req) ++ return 0; ++ ++ *acked = ~(req ^ ack) & req_mask; ++ return -ETIMEDOUT; ++} ++ ++/** ++ * panthor_fw_glb_wait_acks() - Wait for global requests to be acknowledged. ++ * @ptdev: Device. ++ * @req_mask: Mask of requests to wait for. ++ * @acked: Pointer to field that's updated with the acked requests. ++ * If the function returns 0, *acked == req_mask. ++ * @timeout_ms: Timeout expressed in milliseconds. ++ * ++ * Return: 0 on success, -ETIMEDOUT otherwise. ++ */ ++int panthor_fw_glb_wait_acks(struct panthor_device *ptdev, ++ u32 req_mask, u32 *acked, ++ u32 timeout_ms) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ ++ /* GLB_HALT doesn't get acked through the FW interface. */ ++ if (drm_WARN_ON(&ptdev->base, req_mask & (~GLB_REQ_MASK | GLB_HALT))) ++ return -EINVAL; ++ ++ return panthor_fw_wait_acks(&glb_iface->input->req, ++ &glb_iface->output->ack, ++ &ptdev->fw->req_waitqueue, ++ req_mask, acked, timeout_ms); ++} ++ ++/** ++ * panthor_fw_csg_wait_acks() - Wait for command stream group requests to be acknowledged. ++ * @ptdev: Device. ++ * @csg_slot: CSG slot ID. ++ * @req_mask: Mask of requests to wait for. ++ * @acked: Pointer to field that's updated with the acked requests. ++ * If the function returns 0, *acked == req_mask. ++ * @timeout_ms: Timeout expressed in milliseconds. ++ * ++ * Return: 0 on success, -ETIMEDOUT otherwise. ++ */ ++int panthor_fw_csg_wait_acks(struct panthor_device *ptdev, u32 csg_slot, ++ u32 req_mask, u32 *acked, u32 timeout_ms) ++{ ++ struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, csg_slot); ++ int ret; ++ ++ if (drm_WARN_ON(&ptdev->base, req_mask & ~CSG_REQ_MASK)) ++ return -EINVAL; ++ ++ ret = panthor_fw_wait_acks(&csg_iface->input->req, ++ &csg_iface->output->ack, ++ &ptdev->fw->req_waitqueue, ++ req_mask, acked, timeout_ms); ++ ++ /* ++ * Check that all bits in the state field were updated, if any mismatch ++ * then clear all bits in the state field. This allows code to do ++ * (acked & CSG_STATE_MASK) and get the right value. ++ */ ++ ++ if ((*acked & CSG_STATE_MASK) != CSG_STATE_MASK) ++ *acked &= ~CSG_STATE_MASK; ++ ++ return ret; ++} ++ ++/** ++ * panthor_fw_ring_csg_doorbells() - Ring command stream group doorbells. ++ * @ptdev: Device. ++ * @csg_mask: Bitmask encoding the command stream group doorbells to ring. ++ * ++ * This function is toggling bits in the doorbell_req and ringing the ++ * global doorbell. It doesn't require a user doorbell to be attached to ++ * the group. ++ */ ++void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_mask) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ ++ panthor_fw_toggle_reqs(glb_iface, doorbell_req, doorbell_ack, csg_mask); ++ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1); ++} ++ ++static void panthor_fw_ping_work(struct work_struct *work) ++{ ++ struct panthor_fw *fw = container_of(work, struct panthor_fw, watchdog.ping_work.work); ++ struct panthor_device *ptdev = fw->irq.ptdev; ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ u32 acked; ++ int ret; ++ ++ if (panthor_device_reset_is_pending(ptdev)) ++ return; ++ ++ panthor_fw_toggle_reqs(glb_iface, req, ack, GLB_PING); ++ gpu_write(ptdev, CSF_DOORBELL(CSF_GLB_DOORBELL_ID), 1); ++ ++ ret = panthor_fw_glb_wait_acks(ptdev, GLB_PING, &acked, 100); ++ if (ret) { ++ panthor_device_schedule_reset(ptdev); ++ drm_err(&ptdev->base, "FW ping timeout, scheduling a reset"); ++ } else { ++ mod_delayed_work(ptdev->reset.wq, &fw->watchdog.ping_work, ++ msecs_to_jiffies(PING_INTERVAL_MS)); ++ } ++} ++ ++/** ++ * panthor_fw_init() - Initialize FW related data. ++ * @ptdev: Device. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++int panthor_fw_init(struct panthor_device *ptdev) ++{ ++ struct panthor_fw *fw; ++ int ret, irq; ++ ++ fw = drmm_kzalloc(&ptdev->base, sizeof(*fw), GFP_KERNEL); ++ if (!fw) ++ return -ENOMEM; ++ ++ ptdev->fw = fw; ++ init_waitqueue_head(&fw->req_waitqueue); ++ INIT_LIST_HEAD(&fw->sections); ++ INIT_DELAYED_WORK(&fw->watchdog.ping_work, panthor_fw_ping_work); ++ ++ irq = platform_get_irq_byname(to_platform_device(ptdev->base.dev), "job"); ++ if (irq <= 0) ++ return -ENODEV; ++ ++ ret = panthor_request_job_irq(ptdev, &fw->irq, irq, 0); ++ if (ret) { ++ drm_err(&ptdev->base, "failed to request job irq"); ++ return ret; ++ } ++ ++ ret = panthor_gpu_l2_power_on(ptdev); ++ if (ret) ++ return ret; ++ ++ fw->vm = panthor_vm_create(ptdev, true, ++ 0, SZ_4G, ++ CSF_MCU_SHARED_REGION_START, ++ CSF_MCU_SHARED_REGION_SIZE); ++ if (IS_ERR(fw->vm)) { ++ ret = PTR_ERR(fw->vm); ++ fw->vm = NULL; ++ goto err_unplug_fw; ++ } ++ ++ ret = panthor_fw_load(ptdev); ++ if (ret) ++ goto err_unplug_fw; ++ ++ ret = panthor_vm_active(fw->vm); ++ if (ret) ++ goto err_unplug_fw; ++ ++ ret = panthor_fw_start(ptdev); ++ if (ret) ++ goto err_unplug_fw; ++ ++ ret = panthor_fw_init_ifaces(ptdev); ++ if (ret) ++ goto err_unplug_fw; ++ ++ panthor_fw_init_global_iface(ptdev); ++ return 0; ++ ++err_unplug_fw: ++ panthor_fw_unplug(ptdev); ++ return ret; ++} ++ ++MODULE_FIRMWARE("arm/mali/arch10.8/mali_csffw.bin"); +diff --git a/drivers/gpu/drm/panthor/panthor_fw.h b/drivers/gpu/drm/panthor/panthor_fw.h +new file mode 100644 +index 000000000000..22448abde992 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_fw.h +@@ -0,0 +1,503 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifndef __PANTHOR_MCU_H__ ++#define __PANTHOR_MCU_H__ ++ ++#include ++ ++struct panthor_device; ++struct panthor_kernel_bo; ++ ++#define MAX_CSGS 31 ++#define MAX_CS_PER_CSG 32 ++ ++struct panthor_fw_ringbuf_input_iface { ++ u64 insert; ++ u64 extract; ++}; ++ ++struct panthor_fw_ringbuf_output_iface { ++ u64 extract; ++ u32 active; ++}; ++ ++struct panthor_fw_cs_control_iface { ++#define CS_FEATURES_WORK_REGS(x) (((x) & GENMASK(7, 0)) + 1) ++#define CS_FEATURES_SCOREBOARDS(x) (((x) & GENMASK(15, 8)) >> 8) ++#define CS_FEATURES_COMPUTE BIT(16) ++#define CS_FEATURES_FRAGMENT BIT(17) ++#define CS_FEATURES_TILER BIT(18) ++ u32 features; ++ u32 input_va; ++ u32 output_va; ++}; ++ ++struct panthor_fw_cs_input_iface { ++#define CS_STATE_MASK GENMASK(2, 0) ++#define CS_STATE_STOP 0 ++#define CS_STATE_START 1 ++#define CS_EXTRACT_EVENT BIT(4) ++#define CS_IDLE_SYNC_WAIT BIT(8) ++#define CS_IDLE_PROTM_PENDING BIT(9) ++#define CS_IDLE_EMPTY BIT(10) ++#define CS_IDLE_RESOURCE_REQ BIT(11) ++#define CS_TILER_OOM BIT(26) ++#define CS_PROTM_PENDING BIT(27) ++#define CS_FATAL BIT(30) ++#define CS_FAULT BIT(31) ++#define CS_REQ_MASK (CS_STATE_MASK | \ ++ CS_EXTRACT_EVENT | \ ++ CS_IDLE_SYNC_WAIT | \ ++ CS_IDLE_PROTM_PENDING | \ ++ CS_IDLE_EMPTY | \ ++ CS_IDLE_RESOURCE_REQ) ++#define CS_EVT_MASK (CS_TILER_OOM | \ ++ CS_PROTM_PENDING | \ ++ CS_FATAL | \ ++ CS_FAULT) ++ u32 req; ++ ++#define CS_CONFIG_PRIORITY(x) ((x) & GENMASK(3, 0)) ++#define CS_CONFIG_DOORBELL(x) (((x) << 8) & GENMASK(15, 8)) ++ u32 config; ++ u32 reserved1; ++ u32 ack_irq_mask; ++ u64 ringbuf_base; ++ u32 ringbuf_size; ++ u32 reserved2; ++ u64 heap_start; ++ u64 heap_end; ++ u64 ringbuf_input; ++ u64 ringbuf_output; ++ u32 instr_config; ++ u32 instrbuf_size; ++ u64 instrbuf_base; ++ u64 instrbuf_offset_ptr; ++}; ++ ++struct panthor_fw_cs_output_iface { ++ u32 ack; ++ u32 reserved1[15]; ++ u64 status_cmd_ptr; ++ ++#define CS_STATUS_WAIT_SB_MASK GENMASK(15, 0) ++#define CS_STATUS_WAIT_SB_SRC_MASK GENMASK(19, 16) ++#define CS_STATUS_WAIT_SB_SRC_NONE (0 << 16) ++#define CS_STATUS_WAIT_SB_SRC_WAIT (8 << 16) ++#define CS_STATUS_WAIT_SYNC_COND_LE (0 << 24) ++#define CS_STATUS_WAIT_SYNC_COND_GT (1 << 24) ++#define CS_STATUS_WAIT_SYNC_COND_MASK GENMASK(27, 24) ++#define CS_STATUS_WAIT_PROGRESS BIT(28) ++#define CS_STATUS_WAIT_PROTM BIT(29) ++#define CS_STATUS_WAIT_SYNC_64B BIT(30) ++#define CS_STATUS_WAIT_SYNC BIT(31) ++ u32 status_wait; ++ u32 status_req_resource; ++ u64 status_wait_sync_ptr; ++ u32 status_wait_sync_value; ++ u32 status_scoreboards; ++ ++#define CS_STATUS_BLOCKED_REASON_UNBLOCKED 0 ++#define CS_STATUS_BLOCKED_REASON_SB_WAIT 1 ++#define CS_STATUS_BLOCKED_REASON_PROGRESS_WAIT 2 ++#define CS_STATUS_BLOCKED_REASON_SYNC_WAIT 3 ++#define CS_STATUS_BLOCKED_REASON_DEFERRED 5 ++#define CS_STATUS_BLOCKED_REASON_RES 6 ++#define CS_STATUS_BLOCKED_REASON_FLUSH 7 ++#define CS_STATUS_BLOCKED_REASON_MASK GENMASK(3, 0) ++ u32 status_blocked_reason; ++ u32 status_wait_sync_value_hi; ++ u32 reserved2[6]; ++ ++#define CS_EXCEPTION_TYPE(x) ((x) & GENMASK(7, 0)) ++#define CS_EXCEPTION_DATA(x) (((x) >> 8) & GENMASK(23, 0)) ++ u32 fault; ++ u32 fatal; ++ u64 fault_info; ++ u64 fatal_info; ++ u32 reserved3[10]; ++ u32 heap_vt_start; ++ u32 heap_vt_end; ++ u32 reserved4; ++ u32 heap_frag_end; ++ u64 heap_address; ++}; ++ ++struct panthor_fw_csg_control_iface { ++ u32 features; ++ u32 input_va; ++ u32 output_va; ++ u32 suspend_size; ++ u32 protm_suspend_size; ++ u32 stream_num; ++ u32 stream_stride; ++}; ++ ++struct panthor_fw_csg_input_iface { ++#define CSG_STATE_MASK GENMASK(2, 0) ++#define CSG_STATE_TERMINATE 0 ++#define CSG_STATE_START 1 ++#define CSG_STATE_SUSPEND 2 ++#define CSG_STATE_RESUME 3 ++#define CSG_ENDPOINT_CONFIG BIT(4) ++#define CSG_STATUS_UPDATE BIT(5) ++#define CSG_SYNC_UPDATE BIT(28) ++#define CSG_IDLE BIT(29) ++#define CSG_DOORBELL BIT(30) ++#define CSG_PROGRESS_TIMER_EVENT BIT(31) ++#define CSG_REQ_MASK (CSG_STATE_MASK | \ ++ CSG_ENDPOINT_CONFIG | \ ++ CSG_STATUS_UPDATE) ++#define CSG_EVT_MASK (CSG_SYNC_UPDATE | \ ++ CSG_IDLE | \ ++ CSG_PROGRESS_TIMER_EVENT) ++ u32 req; ++ u32 ack_irq_mask; ++ ++ u32 doorbell_req; ++ u32 cs_irq_ack; ++ u32 reserved1[4]; ++ u64 allow_compute; ++ u64 allow_fragment; ++ u32 allow_other; ++ ++#define CSG_EP_REQ_COMPUTE(x) ((x) & GENMASK(7, 0)) ++#define CSG_EP_REQ_FRAGMENT(x) (((x) << 8) & GENMASK(15, 8)) ++#define CSG_EP_REQ_TILER(x) (((x) << 16) & GENMASK(19, 16)) ++#define CSG_EP_REQ_EXCL_COMPUTE BIT(20) ++#define CSG_EP_REQ_EXCL_FRAGMENT BIT(21) ++#define CSG_EP_REQ_PRIORITY(x) (((x) << 28) & GENMASK(31, 28)) ++#define CSG_EP_REQ_PRIORITY_MASK GENMASK(31, 28) ++ u32 endpoint_req; ++ u32 reserved2[2]; ++ u64 suspend_buf; ++ u64 protm_suspend_buf; ++ u32 config; ++ u32 iter_trace_config; ++}; ++ ++struct panthor_fw_csg_output_iface { ++ u32 ack; ++ u32 reserved1; ++ u32 doorbell_ack; ++ u32 cs_irq_req; ++ u32 status_endpoint_current; ++ u32 status_endpoint_req; ++ ++#define CSG_STATUS_STATE_IS_IDLE BIT(0) ++ u32 status_state; ++ u32 resource_dep; ++}; ++ ++struct panthor_fw_global_control_iface { ++ u32 version; ++ u32 features; ++ u32 input_va; ++ u32 output_va; ++ u32 group_num; ++ u32 group_stride; ++ u32 perfcnt_size; ++ u32 instr_features; ++}; ++ ++struct panthor_fw_global_input_iface { ++#define GLB_HALT BIT(0) ++#define GLB_CFG_PROGRESS_TIMER BIT(1) ++#define GLB_CFG_ALLOC_EN BIT(2) ++#define GLB_CFG_POWEROFF_TIMER BIT(3) ++#define GLB_PROTM_ENTER BIT(4) ++#define GLB_PERFCNT_EN BIT(5) ++#define GLB_PERFCNT_SAMPLE BIT(6) ++#define GLB_COUNTER_EN BIT(7) ++#define GLB_PING BIT(8) ++#define GLB_FWCFG_UPDATE BIT(9) ++#define GLB_IDLE_EN BIT(10) ++#define GLB_SLEEP BIT(12) ++#define GLB_INACTIVE_COMPUTE BIT(20) ++#define GLB_INACTIVE_FRAGMENT BIT(21) ++#define GLB_INACTIVE_TILER BIT(22) ++#define GLB_PROTM_EXIT BIT(23) ++#define GLB_PERFCNT_THRESHOLD BIT(24) ++#define GLB_PERFCNT_OVERFLOW BIT(25) ++#define GLB_IDLE BIT(26) ++#define GLB_DBG_CSF BIT(30) ++#define GLB_DBG_HOST BIT(31) ++#define GLB_REQ_MASK GENMASK(10, 0) ++#define GLB_EVT_MASK GENMASK(26, 20) ++ u32 req; ++ u32 ack_irq_mask; ++ u32 doorbell_req; ++ u32 reserved1; ++ u32 progress_timer; ++ ++#define GLB_TIMER_VAL(x) ((x) & GENMASK(30, 0)) ++#define GLB_TIMER_SOURCE_GPU_COUNTER BIT(31) ++ u32 poweroff_timer; ++ u64 core_en_mask; ++ u32 reserved2; ++ u32 perfcnt_as; ++ u64 perfcnt_base; ++ u32 perfcnt_extract; ++ u32 reserved3[3]; ++ u32 perfcnt_config; ++ u32 perfcnt_csg_select; ++ u32 perfcnt_fw_enable; ++ u32 perfcnt_csg_enable; ++ u32 perfcnt_csf_enable; ++ u32 perfcnt_shader_enable; ++ u32 perfcnt_tiler_enable; ++ u32 perfcnt_mmu_l2_enable; ++ u32 reserved4[8]; ++ u32 idle_timer; ++}; ++ ++enum panthor_fw_halt_status { ++ PANTHOR_FW_HALT_OK = 0, ++ PANTHOR_FW_HALT_ON_PANIC = 0x4e, ++ PANTHOR_FW_HALT_ON_WATCHDOG_EXPIRATION = 0x4f, ++}; ++ ++struct panthor_fw_global_output_iface { ++ u32 ack; ++ u32 reserved1; ++ u32 doorbell_ack; ++ u32 reserved2; ++ u32 halt_status; ++ u32 perfcnt_status; ++ u32 perfcnt_insert; ++}; ++ ++/** ++ * struct panthor_fw_cs_iface - Firmware command stream slot interface ++ */ ++struct panthor_fw_cs_iface { ++ /** ++ * @lock: Lock protecting access to the panthor_fw_cs_input_iface::req ++ * field. ++ * ++ * Needed so we can update the req field concurrently from the interrupt ++ * handler and the scheduler logic. ++ * ++ * TODO: Ideally we'd want to use a cmpxchg() to update the req, but FW ++ * interface sections are mapped uncached/write-combined right now, and ++ * using cmpxchg() on such mappings leads to SError faults. Revisit when ++ * we have 'SHARED' GPU mappings hooked up. ++ */ ++ spinlock_t lock; ++ ++ /** ++ * @control: Command stream slot control interface. ++ * ++ * Used to expose command stream slot properties. ++ * ++ * This interface is read-only. ++ */ ++ struct panthor_fw_cs_control_iface *control; ++ ++ /** ++ * @input: Command stream slot input interface. ++ * ++ * Used for host updates/events. ++ */ ++ struct panthor_fw_cs_input_iface *input; ++ ++ /** ++ * @output: Command stream slot output interface. ++ * ++ * Used for FW updates/events. ++ * ++ * This interface is read-only. ++ */ ++ const struct panthor_fw_cs_output_iface *output; ++}; ++ ++/** ++ * struct panthor_fw_csg_iface - Firmware command stream group slot interface ++ */ ++struct panthor_fw_csg_iface { ++ /** ++ * @lock: Lock protecting access to the panthor_fw_csg_input_iface::req ++ * field. ++ * ++ * Needed so we can update the req field concurrently from the interrupt ++ * handler and the scheduler logic. ++ * ++ * TODO: Ideally we'd want to use a cmpxchg() to update the req, but FW ++ * interface sections are mapped uncached/write-combined right now, and ++ * using cmpxchg() on such mappings leads to SError faults. Revisit when ++ * we have 'SHARED' GPU mappings hooked up. ++ */ ++ spinlock_t lock; ++ ++ /** ++ * @control: Command stream group slot control interface. ++ * ++ * Used to expose command stream group slot properties. ++ * ++ * This interface is read-only. ++ */ ++ const struct panthor_fw_csg_control_iface *control; ++ ++ /** ++ * @input: Command stream slot input interface. ++ * ++ * Used for host updates/events. ++ */ ++ struct panthor_fw_csg_input_iface *input; ++ ++ /** ++ * @output: Command stream group slot output interface. ++ * ++ * Used for FW updates/events. ++ * ++ * This interface is read-only. ++ */ ++ const struct panthor_fw_csg_output_iface *output; ++}; ++ ++/** ++ * struct panthor_fw_global_iface - Firmware global interface ++ */ ++struct panthor_fw_global_iface { ++ /** ++ * @lock: Lock protecting access to the panthor_fw_global_input_iface::req ++ * field. ++ * ++ * Needed so we can update the req field concurrently from the interrupt ++ * handler and the scheduler/FW management logic. ++ * ++ * TODO: Ideally we'd want to use a cmpxchg() to update the req, but FW ++ * interface sections are mapped uncached/write-combined right now, and ++ * using cmpxchg() on such mappings leads to SError faults. Revisit when ++ * we have 'SHARED' GPU mappings hooked up. ++ */ ++ spinlock_t lock; ++ ++ /** ++ * @control: Command stream group slot control interface. ++ * ++ * Used to expose global FW properties. ++ * ++ * This interface is read-only. ++ */ ++ const struct panthor_fw_global_control_iface *control; ++ ++ /** ++ * @input: Global input interface. ++ * ++ * Used for host updates/events. ++ */ ++ struct panthor_fw_global_input_iface *input; ++ ++ /** ++ * @output: Global output interface. ++ * ++ * Used for FW updates/events. ++ * ++ * This interface is read-only. ++ */ ++ const struct panthor_fw_global_output_iface *output; ++}; ++ ++/** ++ * panthor_fw_toggle_reqs() - Toggle acknowledge bits to send an event to the FW ++ * @__iface: The interface to operate on. ++ * @__in_reg: Name of the register to update in the input section of the interface. ++ * @__out_reg: Name of the register to take as a reference in the output section of the ++ * interface. ++ * @__mask: Mask to apply to the update. ++ * ++ * The Host -> FW event/message passing was designed to be lockless, with each side of ++ * the channel having its writeable section. Events are signaled as a difference between ++ * the host and FW side in the req/ack registers (when a bit differs, there's an event ++ * pending, when they are the same, nothing needs attention). ++ * ++ * This helper allows one to update the req register based on the current value of the ++ * ack register managed by the FW. Toggling a specific bit will flag an event. In order ++ * for events to be re-evaluated, the interface doorbell needs to be rung. ++ * ++ * Concurrent accesses to the same req register is covered. ++ * ++ * Anything requiring atomic updates to multiple registers requires a dedicated lock. ++ */ ++#define panthor_fw_toggle_reqs(__iface, __in_reg, __out_reg, __mask) \ ++ do { \ ++ u32 __cur_val, __new_val, __out_val; \ ++ spin_lock(&(__iface)->lock); \ ++ __cur_val = READ_ONCE((__iface)->input->__in_reg); \ ++ __out_val = READ_ONCE((__iface)->output->__out_reg); \ ++ __new_val = ((__out_val ^ (__mask)) & (__mask)) | (__cur_val & ~(__mask)); \ ++ WRITE_ONCE((__iface)->input->__in_reg, __new_val); \ ++ spin_unlock(&(__iface)->lock); \ ++ } while (0) ++ ++/** ++ * panthor_fw_update_reqs() - Update bits to reflect a configuration change ++ * @__iface: The interface to operate on. ++ * @__in_reg: Name of the register to update in the input section of the interface. ++ * @__val: Value to set. ++ * @__mask: Mask to apply to the update. ++ * ++ * Some configuration get passed through req registers that are also used to ++ * send events to the FW. Those req registers being updated from the interrupt ++ * handler, they require special helpers to update the configuration part as well. ++ * ++ * Concurrent accesses to the same req register is covered. ++ * ++ * Anything requiring atomic updates to multiple registers requires a dedicated lock. ++ */ ++#define panthor_fw_update_reqs(__iface, __in_reg, __val, __mask) \ ++ do { \ ++ u32 __cur_val, __new_val; \ ++ spin_lock(&(__iface)->lock); \ ++ __cur_val = READ_ONCE((__iface)->input->__in_reg); \ ++ __new_val = (__cur_val & ~(__mask)) | ((__val) & (__mask)); \ ++ WRITE_ONCE((__iface)->input->__in_reg, __new_val); \ ++ spin_unlock(&(__iface)->lock); \ ++ } while (0) ++ ++struct panthor_fw_global_iface * ++panthor_fw_get_glb_iface(struct panthor_device *ptdev); ++ ++struct panthor_fw_csg_iface * ++panthor_fw_get_csg_iface(struct panthor_device *ptdev, u32 csg_slot); ++ ++struct panthor_fw_cs_iface * ++panthor_fw_get_cs_iface(struct panthor_device *ptdev, u32 csg_slot, u32 cs_slot); ++ ++int panthor_fw_csg_wait_acks(struct panthor_device *ptdev, u32 csg_id, u32 req_mask, ++ u32 *acked, u32 timeout_ms); ++ ++int panthor_fw_glb_wait_acks(struct panthor_device *ptdev, u32 req_mask, u32 *acked, ++ u32 timeout_ms); ++ ++void panthor_fw_ring_csg_doorbells(struct panthor_device *ptdev, u32 csg_slot); ++ ++struct panthor_kernel_bo * ++panthor_fw_alloc_queue_iface_mem(struct panthor_device *ptdev, ++ struct panthor_fw_ringbuf_input_iface **input, ++ const struct panthor_fw_ringbuf_output_iface **output, ++ u32 *input_fw_va, u32 *output_fw_va); ++struct panthor_kernel_bo * ++panthor_fw_alloc_suspend_buf_mem(struct panthor_device *ptdev, size_t size); ++ ++struct panthor_vm *panthor_fw_vm(struct panthor_device *ptdev); ++ ++void panthor_fw_pre_reset(struct panthor_device *ptdev, bool on_hang); ++int panthor_fw_post_reset(struct panthor_device *ptdev); ++ ++static inline void panthor_fw_suspend(struct panthor_device *ptdev) ++{ ++ panthor_fw_pre_reset(ptdev, false); ++} ++ ++static inline int panthor_fw_resume(struct panthor_device *ptdev) ++{ ++ return panthor_fw_post_reset(ptdev); ++} ++ ++int panthor_fw_init(struct panthor_device *ptdev); ++void panthor_fw_unplug(struct panthor_device *ptdev); ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:23 +0100 +Subject: drm/panthor: Add the heap logical block + +Tiler heap growing requires some kernel driver involvement: when the +tiler runs out of heap memory, it will raise an exception which is +either directly handled by the firmware if some free heap chunks are +available in the heap context, or passed back to the kernel otherwise. +The heap helpers will be used by the scheduler logic to allocate more +heap chunks to a heap context, when such a situation happens. + +Heap context creation is explicitly requested by userspace (using +the TILER_HEAP_CREATE ioctl), and the returned context is attached to a +queue through some command stream instruction. + +All the kernel does is keep the list of heap chunks allocated to a +context, so they can be freed when TILER_HEAP_DESTROY is called, or +extended when the FW requests a new chunk. + +v6: +- Add Maxime's and Heiko's acks + +v5: +- Fix FIXME comment +- Add Steve's R-b + +v4: +- Rework locking to allow concurrent calls to panthor_heap_grow() +- Add a helper to return a heap chunk if we couldn't pass it to the + FW because the group was scheduled out + +v3: +- Add a FIXME for the heap OOM deadlock +- Use the panthor_kernel_bo abstraction for the heap context and heap + chunks +- Drop the panthor_heap_gpu_ctx struct as it is opaque to the driver +- Ensure that the heap context is aligned to the GPU cache line size +- Minor code tidy ups + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_heap.c | 597 ++++++++++ + drivers/gpu/drm/panthor/panthor_heap.h | 39 + + 2 files changed, 636 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_heap.c b/drivers/gpu/drm/panthor/panthor_heap.c +new file mode 100644 +index 000000000000..143fa35f2e74 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_heap.c +@@ -0,0 +1,597 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2023 Collabora ltd. */ ++ ++#include ++#include ++ ++#include ++ ++#include "panthor_device.h" ++#include "panthor_gem.h" ++#include "panthor_heap.h" ++#include "panthor_mmu.h" ++#include "panthor_regs.h" ++ ++/* ++ * The GPU heap context is an opaque structure used by the GPU to track the ++ * heap allocations. The driver should only touch it to initialize it (zero all ++ * fields). Because the CPU and GPU can both access this structure it is ++ * required to be GPU cache line aligned. ++ */ ++#define HEAP_CONTEXT_SIZE 32 ++ ++/** ++ * struct panthor_heap_chunk_header - Heap chunk header ++ */ ++struct panthor_heap_chunk_header { ++ /** ++ * @next: Next heap chunk in the list. ++ * ++ * This is a GPU VA. ++ */ ++ u64 next; ++ ++ /** @unknown: MBZ. */ ++ u32 unknown[14]; ++}; ++ ++/** ++ * struct panthor_heap_chunk - Structure used to keep track of allocated heap chunks. ++ */ ++struct panthor_heap_chunk { ++ /** @node: Used to insert the heap chunk in panthor_heap::chunks. */ ++ struct list_head node; ++ ++ /** @bo: Buffer object backing the heap chunk. */ ++ struct panthor_kernel_bo *bo; ++}; ++ ++/** ++ * struct panthor_heap - Structure used to manage tiler heap contexts. ++ */ ++struct panthor_heap { ++ /** @chunks: List containing all heap chunks allocated so far. */ ++ struct list_head chunks; ++ ++ /** @lock: Lock protecting insertion in the chunks list. */ ++ struct mutex lock; ++ ++ /** @chunk_size: Size of each chunk. */ ++ u32 chunk_size; ++ ++ /** @max_chunks: Maximum number of chunks. */ ++ u32 max_chunks; ++ ++ /** ++ * @target_in_flight: Number of in-flight render passes after which ++ * we'd let the FW wait for fragment job to finish instead of allocating new chunks. ++ */ ++ u32 target_in_flight; ++ ++ /** @chunk_count: Number of heap chunks currently allocated. */ ++ u32 chunk_count; ++}; ++ ++#define MAX_HEAPS_PER_POOL 128 ++ ++/** ++ * struct panthor_heap_pool - Pool of heap contexts ++ * ++ * The pool is attached to a panthor_file and can't be shared across processes. ++ */ ++struct panthor_heap_pool { ++ /** @refcount: Reference count. */ ++ struct kref refcount; ++ ++ /** @ptdev: Device. */ ++ struct panthor_device *ptdev; ++ ++ /** @vm: VM this pool is bound to. */ ++ struct panthor_vm *vm; ++ ++ /** @lock: Lock protecting access to @xa. */ ++ struct rw_semaphore lock; ++ ++ /** @xa: Array storing panthor_heap objects. */ ++ struct xarray xa; ++ ++ /** @gpu_contexts: Buffer object containing the GPU heap contexts. */ ++ struct panthor_kernel_bo *gpu_contexts; ++}; ++ ++static int panthor_heap_ctx_stride(struct panthor_device *ptdev) ++{ ++ u32 l2_features = ptdev->gpu_info.l2_features; ++ u32 gpu_cache_line_size = GPU_L2_FEATURES_LINE_SIZE(l2_features); ++ ++ return ALIGN(HEAP_CONTEXT_SIZE, gpu_cache_line_size); ++} ++ ++static int panthor_get_heap_ctx_offset(struct panthor_heap_pool *pool, int id) ++{ ++ return panthor_heap_ctx_stride(pool->ptdev) * id; ++} ++ ++static void *panthor_get_heap_ctx(struct panthor_heap_pool *pool, int id) ++{ ++ return pool->gpu_contexts->kmap + ++ panthor_get_heap_ctx_offset(pool, id); ++} ++ ++static void panthor_free_heap_chunk(struct panthor_vm *vm, ++ struct panthor_heap *heap, ++ struct panthor_heap_chunk *chunk) ++{ ++ mutex_lock(&heap->lock); ++ list_del(&chunk->node); ++ heap->chunk_count--; ++ mutex_unlock(&heap->lock); ++ ++ panthor_kernel_bo_destroy(vm, chunk->bo); ++ kfree(chunk); ++} ++ ++static int panthor_alloc_heap_chunk(struct panthor_device *ptdev, ++ struct panthor_vm *vm, ++ struct panthor_heap *heap, ++ bool initial_chunk) ++{ ++ struct panthor_heap_chunk *chunk; ++ struct panthor_heap_chunk_header *hdr; ++ int ret; ++ ++ chunk = kmalloc(sizeof(*chunk), GFP_KERNEL); ++ if (!chunk) ++ return -ENOMEM; ++ ++ chunk->bo = panthor_kernel_bo_create(ptdev, vm, heap->chunk_size, ++ DRM_PANTHOR_BO_NO_MMAP, ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC, ++ PANTHOR_VM_KERNEL_AUTO_VA); ++ if (IS_ERR(chunk->bo)) { ++ ret = PTR_ERR(chunk->bo); ++ goto err_free_chunk; ++ } ++ ++ ret = panthor_kernel_bo_vmap(chunk->bo); ++ if (ret) ++ goto err_destroy_bo; ++ ++ hdr = chunk->bo->kmap; ++ memset(hdr, 0, sizeof(*hdr)); ++ ++ if (initial_chunk && !list_empty(&heap->chunks)) { ++ struct panthor_heap_chunk *prev_chunk; ++ u64 prev_gpuva; ++ ++ prev_chunk = list_first_entry(&heap->chunks, ++ struct panthor_heap_chunk, ++ node); ++ ++ prev_gpuva = panthor_kernel_bo_gpuva(prev_chunk->bo); ++ hdr->next = (prev_gpuva & GENMASK_ULL(63, 12)) | ++ (heap->chunk_size >> 12); ++ } ++ ++ panthor_kernel_bo_vunmap(chunk->bo); ++ ++ mutex_lock(&heap->lock); ++ list_add(&chunk->node, &heap->chunks); ++ heap->chunk_count++; ++ mutex_unlock(&heap->lock); ++ ++ return 0; ++ ++err_destroy_bo: ++ panthor_kernel_bo_destroy(vm, chunk->bo); ++ ++err_free_chunk: ++ kfree(chunk); ++ ++ return ret; ++} ++ ++static void panthor_free_heap_chunks(struct panthor_vm *vm, ++ struct panthor_heap *heap) ++{ ++ struct panthor_heap_chunk *chunk, *tmp; ++ ++ list_for_each_entry_safe(chunk, tmp, &heap->chunks, node) ++ panthor_free_heap_chunk(vm, heap, chunk); ++} ++ ++static int panthor_alloc_heap_chunks(struct panthor_device *ptdev, ++ struct panthor_vm *vm, ++ struct panthor_heap *heap, ++ u32 chunk_count) ++{ ++ int ret; ++ u32 i; ++ ++ for (i = 0; i < chunk_count; i++) { ++ ret = panthor_alloc_heap_chunk(ptdev, vm, heap, true); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int ++panthor_heap_destroy_locked(struct panthor_heap_pool *pool, u32 handle) ++{ ++ struct panthor_heap *heap; ++ ++ heap = xa_erase(&pool->xa, handle); ++ if (!heap) ++ return -EINVAL; ++ ++ panthor_free_heap_chunks(pool->vm, heap); ++ mutex_destroy(&heap->lock); ++ kfree(heap); ++ return 0; ++} ++ ++/** ++ * panthor_heap_destroy() - Destroy a heap context ++ * @pool: Pool this context belongs to. ++ * @handle: Handle returned by panthor_heap_create(). ++ */ ++int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle) ++{ ++ int ret; ++ ++ down_write(&pool->lock); ++ ret = panthor_heap_destroy_locked(pool, handle); ++ up_write(&pool->lock); ++ ++ return ret; ++} ++ ++/** ++ * panthor_heap_create() - Create a heap context ++ * @pool: Pool to instantiate the heap context from. ++ * @initial_chunk_count: Number of chunk allocated at initialization time. ++ * Must be at least 1. ++ * @chunk_size: The size of each chunk. Must be a power of two between 256k ++ * and 2M. ++ * @max_chunks: Maximum number of chunks that can be allocated. ++ * @target_in_flight: Maximum number of in-flight render passes. ++ * @heap_ctx_gpu_va: Pointer holding the GPU address of the allocated heap ++ * context. ++ * @first_chunk_gpu_va: Pointer holding the GPU address of the first chunk ++ * assigned to the heap context. ++ * ++ * Return: a positive handle on success, a negative error otherwise. ++ */ ++int panthor_heap_create(struct panthor_heap_pool *pool, ++ u32 initial_chunk_count, ++ u32 chunk_size, ++ u32 max_chunks, ++ u32 target_in_flight, ++ u64 *heap_ctx_gpu_va, ++ u64 *first_chunk_gpu_va) ++{ ++ struct panthor_heap *heap; ++ struct panthor_heap_chunk *first_chunk; ++ struct panthor_vm *vm; ++ int ret = 0; ++ u32 id; ++ ++ if (initial_chunk_count == 0) ++ return -EINVAL; ++ ++ if (hweight32(chunk_size) != 1 || ++ chunk_size < SZ_256K || chunk_size > SZ_2M) ++ return -EINVAL; ++ ++ down_read(&pool->lock); ++ vm = panthor_vm_get(pool->vm); ++ up_read(&pool->lock); ++ ++ /* The pool has been destroyed, we can't create a new heap. */ ++ if (!vm) ++ return -EINVAL; ++ ++ heap = kzalloc(sizeof(*heap), GFP_KERNEL); ++ if (!heap) { ++ ret = -ENOMEM; ++ goto err_put_vm; ++ } ++ ++ mutex_init(&heap->lock); ++ INIT_LIST_HEAD(&heap->chunks); ++ heap->chunk_size = chunk_size; ++ heap->max_chunks = max_chunks; ++ heap->target_in_flight = target_in_flight; ++ ++ ret = panthor_alloc_heap_chunks(pool->ptdev, vm, heap, ++ initial_chunk_count); ++ if (ret) ++ goto err_free_heap; ++ ++ first_chunk = list_first_entry(&heap->chunks, ++ struct panthor_heap_chunk, ++ node); ++ *first_chunk_gpu_va = panthor_kernel_bo_gpuva(first_chunk->bo); ++ ++ down_write(&pool->lock); ++ /* The pool has been destroyed, we can't create a new heap. */ ++ if (!pool->vm) { ++ ret = -EINVAL; ++ } else { ++ ret = xa_alloc(&pool->xa, &id, heap, XA_LIMIT(1, MAX_HEAPS_PER_POOL), GFP_KERNEL); ++ if (!ret) { ++ void *gpu_ctx = panthor_get_heap_ctx(pool, id); ++ ++ memset(gpu_ctx, 0, panthor_heap_ctx_stride(pool->ptdev)); ++ *heap_ctx_gpu_va = panthor_kernel_bo_gpuva(pool->gpu_contexts) + ++ panthor_get_heap_ctx_offset(pool, id); ++ } ++ } ++ up_write(&pool->lock); ++ ++ if (ret) ++ goto err_free_heap; ++ ++ panthor_vm_put(vm); ++ return id; ++ ++err_free_heap: ++ panthor_free_heap_chunks(pool->vm, heap); ++ mutex_destroy(&heap->lock); ++ kfree(heap); ++ ++err_put_vm: ++ panthor_vm_put(vm); ++ return ret; ++} ++ ++/** ++ * panthor_heap_return_chunk() - Return an unused heap chunk ++ * @pool: The pool this heap belongs to. ++ * @heap_gpu_va: The GPU address of the heap context. ++ * @chunk_gpu_va: The chunk VA to return. ++ * ++ * This function is used when a chunk allocated with panthor_heap_grow() ++ * couldn't be linked to the heap context through the FW interface because ++ * the group requesting the allocation was scheduled out in the meantime. ++ */ ++int panthor_heap_return_chunk(struct panthor_heap_pool *pool, ++ u64 heap_gpu_va, ++ u64 chunk_gpu_va) ++{ ++ u64 offset = heap_gpu_va - panthor_kernel_bo_gpuva(pool->gpu_contexts); ++ u32 heap_id = (u32)offset / panthor_heap_ctx_stride(pool->ptdev); ++ struct panthor_heap_chunk *chunk, *tmp, *removed = NULL; ++ struct panthor_heap *heap; ++ int ret; ++ ++ if (offset > U32_MAX || heap_id >= MAX_HEAPS_PER_POOL) ++ return -EINVAL; ++ ++ down_read(&pool->lock); ++ heap = xa_load(&pool->xa, heap_id); ++ if (!heap) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ chunk_gpu_va &= GENMASK_ULL(63, 12); ++ ++ mutex_lock(&heap->lock); ++ list_for_each_entry_safe(chunk, tmp, &heap->chunks, node) { ++ if (panthor_kernel_bo_gpuva(chunk->bo) == chunk_gpu_va) { ++ removed = chunk; ++ list_del(&chunk->node); ++ heap->chunk_count--; ++ break; ++ } ++ } ++ mutex_unlock(&heap->lock); ++ ++ if (removed) { ++ panthor_kernel_bo_destroy(pool->vm, chunk->bo); ++ kfree(chunk); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ ++out_unlock: ++ up_read(&pool->lock); ++ return ret; ++} ++ ++/** ++ * panthor_heap_grow() - Make a heap context grow. ++ * @pool: The pool this heap belongs to. ++ * @heap_gpu_va: The GPU address of the heap context. ++ * @renderpasses_in_flight: Number of render passes currently in-flight. ++ * @pending_frag_count: Number of fragment jobs waiting for execution/completion. ++ * @new_chunk_gpu_va: Pointer used to return the chunk VA. ++ */ ++int panthor_heap_grow(struct panthor_heap_pool *pool, ++ u64 heap_gpu_va, ++ u32 renderpasses_in_flight, ++ u32 pending_frag_count, ++ u64 *new_chunk_gpu_va) ++{ ++ u64 offset = heap_gpu_va - panthor_kernel_bo_gpuva(pool->gpu_contexts); ++ u32 heap_id = (u32)offset / panthor_heap_ctx_stride(pool->ptdev); ++ struct panthor_heap_chunk *chunk; ++ struct panthor_heap *heap; ++ int ret; ++ ++ if (offset > U32_MAX || heap_id >= MAX_HEAPS_PER_POOL) ++ return -EINVAL; ++ ++ down_read(&pool->lock); ++ heap = xa_load(&pool->xa, heap_id); ++ if (!heap) { ++ ret = -EINVAL; ++ goto out_unlock; ++ } ++ ++ /* If we reached the target in-flight render passes, or if we ++ * reached the maximum number of chunks, let the FW figure another way to ++ * find some memory (wait for render passes to finish, or call the exception ++ * handler provided by the userspace driver, if any). ++ */ ++ if (renderpasses_in_flight > heap->target_in_flight || ++ (pending_frag_count > 0 && heap->chunk_count >= heap->max_chunks)) { ++ ret = -EBUSY; ++ goto out_unlock; ++ } else if (heap->chunk_count >= heap->max_chunks) { ++ ret = -ENOMEM; ++ goto out_unlock; ++ } ++ ++ /* FIXME: panthor_alloc_heap_chunk() triggers a kernel BO creation, ++ * which goes through the blocking allocation path. Ultimately, we ++ * want a non-blocking allocation, so we can immediately report to the ++ * FW when the system is running out of memory. In that case, the FW ++ * can call a user-provided exception handler, which might try to free ++ * some tiler memory by issuing an intermediate fragment job. If the ++ * exception handler can't do anything, it will flag the queue as ++ * faulty so the job that triggered this tiler chunk allocation and all ++ * further jobs in this queue fail immediately instead of having to ++ * wait for the job timeout. ++ */ ++ ret = panthor_alloc_heap_chunk(pool->ptdev, pool->vm, heap, false); ++ if (ret) ++ goto out_unlock; ++ ++ chunk = list_first_entry(&heap->chunks, ++ struct panthor_heap_chunk, ++ node); ++ *new_chunk_gpu_va = (panthor_kernel_bo_gpuva(chunk->bo) & GENMASK_ULL(63, 12)) | ++ (heap->chunk_size >> 12); ++ ret = 0; ++ ++out_unlock: ++ up_read(&pool->lock); ++ return ret; ++} ++ ++static void panthor_heap_pool_release(struct kref *refcount) ++{ ++ struct panthor_heap_pool *pool = ++ container_of(refcount, struct panthor_heap_pool, refcount); ++ ++ xa_destroy(&pool->xa); ++ kfree(pool); ++} ++ ++/** ++ * panthor_heap_pool_put() - Release a heap pool reference ++ * @pool: Pool to release the reference on. Can be NULL. ++ */ ++void panthor_heap_pool_put(struct panthor_heap_pool *pool) ++{ ++ if (pool) ++ kref_put(&pool->refcount, panthor_heap_pool_release); ++} ++ ++/** ++ * panthor_heap_pool_get() - Get a heap pool reference ++ * @pool: Pool to get the reference on. Can be NULL. ++ * ++ * Return: @pool. ++ */ ++struct panthor_heap_pool * ++panthor_heap_pool_get(struct panthor_heap_pool *pool) ++{ ++ if (pool) ++ kref_get(&pool->refcount); ++ ++ return pool; ++} ++ ++/** ++ * panthor_heap_pool_create() - Create a heap pool ++ * @ptdev: Device. ++ * @vm: The VM this heap pool will be attached to. ++ * ++ * Heap pools might contain up to 128 heap contexts, and are per-VM. ++ * ++ * Return: A valid pointer on success, a negative error code otherwise. ++ */ ++struct panthor_heap_pool * ++panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm) ++{ ++ size_t bosize = ALIGN(MAX_HEAPS_PER_POOL * ++ panthor_heap_ctx_stride(ptdev), ++ 4096); ++ struct panthor_heap_pool *pool; ++ int ret = 0; ++ ++ pool = kzalloc(sizeof(*pool), GFP_KERNEL); ++ if (!pool) ++ return ERR_PTR(-ENOMEM); ++ ++ /* We want a weak ref here: the heap pool belongs to the VM, so we're ++ * sure that, as long as the heap pool exists, the VM exists too. ++ */ ++ pool->vm = vm; ++ pool->ptdev = ptdev; ++ init_rwsem(&pool->lock); ++ xa_init_flags(&pool->xa, XA_FLAGS_ALLOC1); ++ kref_init(&pool->refcount); ++ ++ pool->gpu_contexts = panthor_kernel_bo_create(ptdev, vm, bosize, ++ DRM_PANTHOR_BO_NO_MMAP, ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC, ++ PANTHOR_VM_KERNEL_AUTO_VA); ++ if (IS_ERR(pool->gpu_contexts)) { ++ ret = PTR_ERR(pool->gpu_contexts); ++ goto err_destroy_pool; ++ } ++ ++ ret = panthor_kernel_bo_vmap(pool->gpu_contexts); ++ if (ret) ++ goto err_destroy_pool; ++ ++ return pool; ++ ++err_destroy_pool: ++ panthor_heap_pool_destroy(pool); ++ return ERR_PTR(ret); ++} ++ ++/** ++ * panthor_heap_pool_destroy() - Destroy a heap pool. ++ * @pool: Pool to destroy. ++ * ++ * This function destroys all heap contexts and their resources. Thus ++ * preventing any use of the heap context or the chunk attached to them ++ * after that point. ++ * ++ * If the GPU still has access to some heap contexts, a fault should be ++ * triggered, which should flag the command stream groups using these ++ * context as faulty. ++ * ++ * The heap pool object is only released when all references to this pool ++ * are released. ++ */ ++void panthor_heap_pool_destroy(struct panthor_heap_pool *pool) ++{ ++ struct panthor_heap *heap; ++ unsigned long i; ++ ++ if (!pool) ++ return; ++ ++ down_write(&pool->lock); ++ xa_for_each(&pool->xa, i, heap) ++ drm_WARN_ON(&pool->ptdev->base, panthor_heap_destroy_locked(pool, i)); ++ ++ if (!IS_ERR_OR_NULL(pool->gpu_contexts)) ++ panthor_kernel_bo_destroy(pool->vm, pool->gpu_contexts); ++ ++ /* Reflects the fact the pool has been destroyed. */ ++ pool->vm = NULL; ++ up_write(&pool->lock); ++ ++ panthor_heap_pool_put(pool); ++} +diff --git a/drivers/gpu/drm/panthor/panthor_heap.h b/drivers/gpu/drm/panthor/panthor_heap.h +new file mode 100644 +index 000000000000..25a5f2bba445 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_heap.h +@@ -0,0 +1,39 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifndef __PANTHOR_HEAP_H__ ++#define __PANTHOR_HEAP_H__ ++ ++#include ++ ++struct panthor_device; ++struct panthor_heap_pool; ++struct panthor_vm; ++ ++int panthor_heap_create(struct panthor_heap_pool *pool, ++ u32 initial_chunk_count, ++ u32 chunk_size, ++ u32 max_chunks, ++ u32 target_in_flight, ++ u64 *heap_ctx_gpu_va, ++ u64 *first_chunk_gpu_va); ++int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle); ++ ++struct panthor_heap_pool * ++panthor_heap_pool_create(struct panthor_device *ptdev, struct panthor_vm *vm); ++void panthor_heap_pool_destroy(struct panthor_heap_pool *pool); ++ ++struct panthor_heap_pool * ++panthor_heap_pool_get(struct panthor_heap_pool *pool); ++void panthor_heap_pool_put(struct panthor_heap_pool *pool); ++ ++int panthor_heap_grow(struct panthor_heap_pool *pool, ++ u64 heap_gpu_va, ++ u32 renderpasses_in_flight, ++ u32 pending_frag_count, ++ u64 *new_chunk_gpu_va); ++int panthor_heap_return_chunk(struct panthor_heap_pool *pool, ++ u64 heap_gpu_va, ++ u64 chunk_gpu_va); ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:24 +0100 +Subject: drm/panthor: Add the scheduler logical block + +This is the piece of software interacting with the FW scheduler, and +taking care of some scheduling aspects when the FW comes short of slots +scheduling slots. Indeed, the FW only expose a few slots, and the kernel +has to give all submission contexts, a chance to execute their jobs. + +The kernel-side scheduler is timeslice-based, with a round-robin queue +per priority level. + +Job submission is handled with a 1:1 drm_sched_entity:drm_gpu_scheduler, +allowing us to delegate the dependency tracking to the core. + +All the gory details should be documented inline. + +v6: +- Add Maxime's and Heiko's acks +- Make sure the scheduler is initialized before queueing the tick work + in the MMU fault handler +- Keep header inclusion alphabetically ordered + +v5: +- Fix typos +- Call panthor_kernel_bo_destroy(group->syncobjs) unconditionally +- Don't move the group to the waiting list tail when it was already + waiting for a different syncobj +- Fix fatal_queues flagging in the tiler OOM path +- Don't warn when more than one job timesout on a group +- Add a warning message when we fail to allocate a heap chunk +- Add Steve's R-b + +v4: +- Check drmm_mutex_init() return code +- s/drm_gem_vmap_unlocked/drm_gem_vunmap_unlocked/ in + panthor_queue_put_syncwait_obj() +- Drop unneeded WARN_ON() in cs_slot_sync_queue_state_locked() +- Use atomic_xchg() instead of atomic_fetch_and(0) +- Fix typos +- Let panthor_kernel_bo_destroy() check for IS_ERR_OR_NULL() BOs +- Defer TILER_OOM event handling to a separate workqueue to prevent + deadlocks when the heap chunk allocation is blocked on mem-reclaim. + This is just a temporary solution, until we add support for + non-blocking/failable allocations +- Pass the scheduler workqueue to drm_sched instead of instantiating + a separate one (no longer needed now that heap chunk allocation + happens on a dedicated wq) +- Set WQ_MEM_RECLAIM on the scheduler workqueue, so we can handle + job timeouts when the system is under mem pressure, and hopefully + free up some memory retained by these jobs + +v3: +- Rework the FW event handling logic to avoid races +- Make sure MMU faults kill the group immediately +- Use the panthor_kernel_bo abstraction for group/queue buffers +- Make in_progress an atomic_t, so we can check it without the reset lock + held +- Don't limit the number of groups per context to the FW scheduler + capacity. Fix the limit to 128 for now. +- Add a panthor_job_vm() helper +- Account for panthor_vm changes +- Add our job fence as DMA_RESV_USAGE_WRITE to all external objects + (was previously DMA_RESV_USAGE_BOOKKEEP). I don't get why, given + we're supposed to be fully-explicit, but other drivers do that, so + there must be a good reason +- Account for drm_sched changes +- Provide a panthor_queue_put_syncwait_obj() +- Unconditionally return groups to their idle list in + panthor_sched_suspend() +- Condition of sched_queue_{,delayed_}work fixed to be only when a reset + isn't pending or in progress. +- Several typos in comments fixed. + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_sched.c | 3502 ++++++++++ + drivers/gpu/drm/panthor/panthor_sched.h | 50 + + 2 files changed, 3552 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c +new file mode 100644 +index 000000000000..5f7803b6fc48 +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_sched.c +@@ -0,0 +1,3502 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2023 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "panthor_devfreq.h" ++#include "panthor_device.h" ++#include "panthor_fw.h" ++#include "panthor_gem.h" ++#include "panthor_gpu.h" ++#include "panthor_heap.h" ++#include "panthor_mmu.h" ++#include "panthor_regs.h" ++#include "panthor_sched.h" ++ ++/** ++ * DOC: Scheduler ++ * ++ * Mali CSF hardware adopts a firmware-assisted scheduling model, where ++ * the firmware takes care of scheduling aspects, to some extent. ++ * ++ * The scheduling happens at the scheduling group level, each group ++ * contains 1 to N queues (N is FW/hardware dependent, and exposed ++ * through the firmware interface). Each queue is assigned a command ++ * stream ring buffer, which serves as a way to get jobs submitted to ++ * the GPU, among other things. ++ * ++ * The firmware can schedule a maximum of M groups (M is FW/hardware ++ * dependent, and exposed through the firmware interface). Passed ++ * this maximum number of groups, the kernel must take care of ++ * rotating the groups passed to the firmware so every group gets ++ * a chance to have his queues scheduled for execution. ++ * ++ * The current implementation only supports with kernel-mode queues. ++ * In other terms, userspace doesn't have access to the ring-buffer. ++ * Instead, userspace passes indirect command stream buffers that are ++ * called from the queue ring-buffer by the kernel using a pre-defined ++ * sequence of command stream instructions to ensure the userspace driver ++ * always gets consistent results (cache maintenance, ++ * synchronization, ...). ++ * ++ * We rely on the drm_gpu_scheduler framework to deal with job ++ * dependencies and submission. As any other driver dealing with a ++ * FW-scheduler, we use the 1:1 entity:scheduler mode, such that each ++ * entity has its own job scheduler. When a job is ready to be executed ++ * (all its dependencies are met), it is pushed to the appropriate ++ * queue ring-buffer, and the group is scheduled for execution if it ++ * wasn't already active. ++ * ++ * Kernel-side group scheduling is timeslice-based. When we have less ++ * groups than there are slots, the periodic tick is disabled and we ++ * just let the FW schedule the active groups. When there are more ++ * groups than slots, we let each group a chance to execute stuff for ++ * a given amount of time, and then re-evaluate and pick new groups ++ * to schedule. The group selection algorithm is based on ++ * priority+round-robin. ++ * ++ * Even though user-mode queues is out of the scope right now, the ++ * current design takes them into account by avoiding any guess on the ++ * group/queue state that would be based on information we wouldn't have ++ * if userspace was in charge of the ring-buffer. That's also one of the ++ * reason we don't do 'cooperative' scheduling (encoding FW group slot ++ * reservation as dma_fence that would be returned from the ++ * drm_gpu_scheduler::prepare_job() hook, and treating group rotation as ++ * a queue of waiters, ordered by job submission order). This approach ++ * would work for kernel-mode queues, but would make user-mode queues a ++ * lot more complicated to retrofit. ++ */ ++ ++#define JOB_TIMEOUT_MS 5000 ++ ++#define MIN_CS_PER_CSG 8 ++ ++#define MIN_CSGS 3 ++#define MAX_CSG_PRIO 0xf ++ ++struct panthor_group; ++ ++/** ++ * struct panthor_csg_slot - Command stream group slot ++ * ++ * This represents a FW slot for a scheduling group. ++ */ ++struct panthor_csg_slot { ++ /** @group: Scheduling group bound to this slot. */ ++ struct panthor_group *group; ++ ++ /** @priority: Group priority. */ ++ u8 priority; ++ ++ /** ++ * @idle: True if the group bound to this slot is idle. ++ * ++ * A group is idle when it has nothing waiting for execution on ++ * all its queues, or when queues are blocked waiting for something ++ * to happen (synchronization object). ++ */ ++ bool idle; ++}; ++ ++/** ++ * enum panthor_csg_priority - Group priority ++ */ ++enum panthor_csg_priority { ++ /** @PANTHOR_CSG_PRIORITY_LOW: Low priority group. */ ++ PANTHOR_CSG_PRIORITY_LOW = 0, ++ ++ /** @PANTHOR_CSG_PRIORITY_MEDIUM: Medium priority group. */ ++ PANTHOR_CSG_PRIORITY_MEDIUM, ++ ++ /** @PANTHOR_CSG_PRIORITY_HIGH: High priority group. */ ++ PANTHOR_CSG_PRIORITY_HIGH, ++ ++ /** ++ * @PANTHOR_CSG_PRIORITY_RT: Real-time priority group. ++ * ++ * Real-time priority allows one to preempt scheduling of other ++ * non-real-time groups. When such a group becomes executable, ++ * it will evict the group with the lowest non-rt priority if ++ * there's no free group slot available. ++ * ++ * Currently not exposed to userspace. ++ */ ++ PANTHOR_CSG_PRIORITY_RT, ++ ++ /** @PANTHOR_CSG_PRIORITY_COUNT: Number of priority levels. */ ++ PANTHOR_CSG_PRIORITY_COUNT, ++}; ++ ++/** ++ * struct panthor_scheduler - Object used to manage the scheduler ++ */ ++struct panthor_scheduler { ++ /** @ptdev: Device. */ ++ struct panthor_device *ptdev; ++ ++ /** ++ * @wq: Workqueue used by our internal scheduler logic and ++ * drm_gpu_scheduler. ++ * ++ * Used for the scheduler tick, group update or other kind of FW ++ * event processing that can't be handled in the threaded interrupt ++ * path. Also passed to the drm_gpu_scheduler instances embedded ++ * in panthor_queue. ++ */ ++ struct workqueue_struct *wq; ++ ++ /** ++ * @heap_alloc_wq: Workqueue used to schedule tiler_oom works. ++ * ++ * We have a queue dedicated to heap chunk allocation works to avoid ++ * blocking the rest of the scheduler if the allocation tries to ++ * reclaim memory. ++ */ ++ struct workqueue_struct *heap_alloc_wq; ++ ++ /** @tick_work: Work executed on a scheduling tick. */ ++ struct delayed_work tick_work; ++ ++ /** ++ * @sync_upd_work: Work used to process synchronization object updates. ++ * ++ * We use this work to unblock queues/groups that were waiting on a ++ * synchronization object. ++ */ ++ struct work_struct sync_upd_work; ++ ++ /** ++ * @fw_events_work: Work used to process FW events outside the interrupt path. ++ * ++ * Even if the interrupt is threaded, we need any event processing ++ * that require taking the panthor_scheduler::lock to be processed ++ * outside the interrupt path so we don't block the tick logic when ++ * it calls panthor_fw_{csg,wait}_wait_acks(). Since most of the ++ * event processing requires taking this lock, we just delegate all ++ * FW event processing to the scheduler workqueue. ++ */ ++ struct work_struct fw_events_work; ++ ++ /** ++ * @fw_events: Bitmask encoding pending FW events. ++ */ ++ atomic_t fw_events; ++ ++ /** ++ * @resched_target: When the next tick should occur. ++ * ++ * Expressed in jiffies. ++ */ ++ u64 resched_target; ++ ++ /** ++ * @last_tick: When the last tick occurred. ++ * ++ * Expressed in jiffies. ++ */ ++ u64 last_tick; ++ ++ /** @tick_period: Tick period in jiffies. */ ++ u64 tick_period; ++ ++ /** ++ * @lock: Lock protecting access to all the scheduler fields. ++ * ++ * Should be taken in the tick work, the irq handler, and anywhere the @groups ++ * fields are touched. ++ */ ++ struct mutex lock; ++ ++ /** @groups: Various lists used to classify groups. */ ++ struct { ++ /** ++ * @runnable: Runnable group lists. ++ * ++ * When a group has queues that want to execute something, ++ * its panthor_group::run_node should be inserted here. ++ * ++ * One list per-priority. ++ */ ++ struct list_head runnable[PANTHOR_CSG_PRIORITY_COUNT]; ++ ++ /** ++ * @idle: Idle group lists. ++ * ++ * When all queues of a group are idle (either because they ++ * have nothing to execute, or because they are blocked), the ++ * panthor_group::run_node field should be inserted here. ++ * ++ * One list per-priority. ++ */ ++ struct list_head idle[PANTHOR_CSG_PRIORITY_COUNT]; ++ ++ /** ++ * @waiting: List of groups whose queues are blocked on a ++ * synchronization object. ++ * ++ * Insert panthor_group::wait_node here when a group is waiting ++ * for synchronization objects to be signaled. ++ * ++ * This list is evaluated in the @sync_upd_work work. ++ */ ++ struct list_head waiting; ++ } groups; ++ ++ /** ++ * @csg_slots: FW command stream group slots. ++ */ ++ struct panthor_csg_slot csg_slots[MAX_CSGS]; ++ ++ /** @csg_slot_count: Number of command stream group slots exposed by the FW. */ ++ u32 csg_slot_count; ++ ++ /** @cs_slot_count: Number of command stream slot per group slot exposed by the FW. */ ++ u32 cs_slot_count; ++ ++ /** @as_slot_count: Number of address space slots supported by the MMU. */ ++ u32 as_slot_count; ++ ++ /** @used_csg_slot_count: Number of command stream group slot currently used. */ ++ u32 used_csg_slot_count; ++ ++ /** @sb_slot_count: Number of scoreboard slots. */ ++ u32 sb_slot_count; ++ ++ /** ++ * @might_have_idle_groups: True if an active group might have become idle. ++ * ++ * This will force a tick, so other runnable groups can be scheduled if one ++ * or more active groups became idle. ++ */ ++ bool might_have_idle_groups; ++ ++ /** @pm: Power management related fields. */ ++ struct { ++ /** @has_ref: True if the scheduler owns a runtime PM reference. */ ++ bool has_ref; ++ } pm; ++ ++ /** @reset: Reset related fields. */ ++ struct { ++ /** @lock: Lock protecting the other reset fields. */ ++ struct mutex lock; ++ ++ /** ++ * @in_progress: True if a reset is in progress. ++ * ++ * Set to true in panthor_sched_pre_reset() and back to false in ++ * panthor_sched_post_reset(). ++ */ ++ atomic_t in_progress; ++ ++ /** ++ * @stopped_groups: List containing all groups that were stopped ++ * before a reset. ++ * ++ * Insert panthor_group::run_node in the pre_reset path. ++ */ ++ struct list_head stopped_groups; ++ } reset; ++}; ++ ++/** ++ * struct panthor_syncobj_32b - 32-bit FW synchronization object ++ */ ++struct panthor_syncobj_32b { ++ /** @seqno: Sequence number. */ ++ u32 seqno; ++ ++ /** ++ * @status: Status. ++ * ++ * Not zero on failure. ++ */ ++ u32 status; ++}; ++ ++/** ++ * struct panthor_syncobj_64b - 64-bit FW synchronization object ++ */ ++struct panthor_syncobj_64b { ++ /** @seqno: Sequence number. */ ++ u64 seqno; ++ ++ /** ++ * @status: Status. ++ * ++ * Not zero on failure. ++ */ ++ u32 status; ++ ++ /** @pad: MBZ. */ ++ u32 pad; ++}; ++ ++/** ++ * struct panthor_queue - Execution queue ++ */ ++struct panthor_queue { ++ /** @scheduler: DRM scheduler used for this queue. */ ++ struct drm_gpu_scheduler scheduler; ++ ++ /** @entity: DRM scheduling entity used for this queue. */ ++ struct drm_sched_entity entity; ++ ++ /** ++ * @remaining_time: Time remaining before the job timeout expires. ++ * ++ * The job timeout is suspended when the queue is not scheduled by the ++ * FW. Every time we suspend the timer, we need to save the remaining ++ * time so we can restore it later on. ++ */ ++ unsigned long remaining_time; ++ ++ /** @timeout_suspended: True if the job timeout was suspended. */ ++ bool timeout_suspended; ++ ++ /** ++ * @doorbell_id: Doorbell assigned to this queue. ++ * ++ * Right now, all groups share the same doorbell, and the doorbell ID ++ * is assigned to group_slot + 1 when the group is assigned a slot. But ++ * we might decide to provide fine grained doorbell assignment at some ++ * point, so don't have to wake up all queues in a group every time one ++ * of them is updated. ++ */ ++ u8 doorbell_id; ++ ++ /** ++ * @priority: Priority of the queue inside the group. ++ * ++ * Must be less than 16 (Only 4 bits available). ++ */ ++ u8 priority; ++#define CSF_MAX_QUEUE_PRIO GENMASK(3, 0) ++ ++ /** @ringbuf: Command stream ring-buffer. */ ++ struct panthor_kernel_bo *ringbuf; ++ ++ /** @iface: Firmware interface. */ ++ struct { ++ /** @mem: FW memory allocated for this interface. */ ++ struct panthor_kernel_bo *mem; ++ ++ /** @input: Input interface. */ ++ struct panthor_fw_ringbuf_input_iface *input; ++ ++ /** @output: Output interface. */ ++ const struct panthor_fw_ringbuf_output_iface *output; ++ ++ /** @input_fw_va: FW virtual address of the input interface buffer. */ ++ u32 input_fw_va; ++ ++ /** @output_fw_va: FW virtual address of the output interface buffer. */ ++ u32 output_fw_va; ++ } iface; ++ ++ /** ++ * @syncwait: Stores information about the synchronization object this ++ * queue is waiting on. ++ */ ++ struct { ++ /** @gpu_va: GPU address of the synchronization object. */ ++ u64 gpu_va; ++ ++ /** @ref: Reference value to compare against. */ ++ u64 ref; ++ ++ /** @gt: True if this is a greater-than test. */ ++ bool gt; ++ ++ /** @sync64: True if this is a 64-bit sync object. */ ++ bool sync64; ++ ++ /** @bo: Buffer object holding the synchronization object. */ ++ struct drm_gem_object *obj; ++ ++ /** @offset: Offset of the synchronization object inside @bo. */ ++ u64 offset; ++ ++ /** ++ * @kmap: Kernel mapping of the buffer object holding the ++ * synchronization object. ++ */ ++ void *kmap; ++ } syncwait; ++ ++ /** @fence_ctx: Fence context fields. */ ++ struct { ++ /** @lock: Used to protect access to all fences allocated by this context. */ ++ spinlock_t lock; ++ ++ /** ++ * @id: Fence context ID. ++ * ++ * Allocated with dma_fence_context_alloc(). ++ */ ++ u64 id; ++ ++ /** @seqno: Sequence number of the last initialized fence. */ ++ atomic64_t seqno; ++ ++ /** ++ * @in_flight_jobs: List containing all in-flight jobs. ++ * ++ * Used to keep track and signal panthor_job::done_fence when the ++ * synchronization object attached to the queue is signaled. ++ */ ++ struct list_head in_flight_jobs; ++ } fence_ctx; ++}; ++ ++/** ++ * enum panthor_group_state - Scheduling group state. ++ */ ++enum panthor_group_state { ++ /** @PANTHOR_CS_GROUP_CREATED: Group was created, but not scheduled yet. */ ++ PANTHOR_CS_GROUP_CREATED, ++ ++ /** @PANTHOR_CS_GROUP_ACTIVE: Group is currently scheduled. */ ++ PANTHOR_CS_GROUP_ACTIVE, ++ ++ /** ++ * @PANTHOR_CS_GROUP_SUSPENDED: Group was scheduled at least once, but is ++ * inactive/suspended right now. ++ */ ++ PANTHOR_CS_GROUP_SUSPENDED, ++ ++ /** ++ * @PANTHOR_CS_GROUP_TERMINATED: Group was terminated. ++ * ++ * Can no longer be scheduled. The only allowed action is a destruction. ++ */ ++ PANTHOR_CS_GROUP_TERMINATED, ++}; ++ ++/** ++ * struct panthor_group - Scheduling group object ++ */ ++struct panthor_group { ++ /** @refcount: Reference count */ ++ struct kref refcount; ++ ++ /** @ptdev: Device. */ ++ struct panthor_device *ptdev; ++ ++ /** @vm: VM bound to the group. */ ++ struct panthor_vm *vm; ++ ++ /** @compute_core_mask: Mask of shader cores that can be used for compute jobs. */ ++ u64 compute_core_mask; ++ ++ /** @fragment_core_mask: Mask of shader cores that can be used for fragment jobs. */ ++ u64 fragment_core_mask; ++ ++ /** @tiler_core_mask: Mask of tiler cores that can be used for tiler jobs. */ ++ u64 tiler_core_mask; ++ ++ /** @max_compute_cores: Maximum number of shader cores used for compute jobs. */ ++ u8 max_compute_cores; ++ ++ /** @max_compute_cores: Maximum number of shader cores used for fragment jobs. */ ++ u8 max_fragment_cores; ++ ++ /** @max_tiler_cores: Maximum number of tiler cores used for tiler jobs. */ ++ u8 max_tiler_cores; ++ ++ /** @priority: Group priority (check panthor_csg_priority). */ ++ u8 priority; ++ ++ /** @blocked_queues: Bitmask reflecting the blocked queues. */ ++ u32 blocked_queues; ++ ++ /** @idle_queues: Bitmask reflecting the idle queues. */ ++ u32 idle_queues; ++ ++ /** @fatal_lock: Lock used to protect access to fatal fields. */ ++ spinlock_t fatal_lock; ++ ++ /** @fatal_queues: Bitmask reflecting the queues that hit a fatal exception. */ ++ u32 fatal_queues; ++ ++ /** @tiler_oom: Mask of queues that have a tiler OOM event to process. */ ++ atomic_t tiler_oom; ++ ++ /** @queue_count: Number of queues in this group. */ ++ u32 queue_count; ++ ++ /** @queues: Queues owned by this group. */ ++ struct panthor_queue *queues[MAX_CS_PER_CSG]; ++ ++ /** ++ * @csg_id: ID of the FW group slot. ++ * ++ * -1 when the group is not scheduled/active. ++ */ ++ int csg_id; ++ ++ /** ++ * @destroyed: True when the group has been destroyed. ++ * ++ * If a group is destroyed it becomes useless: no further jobs can be submitted ++ * to its queues. We simply wait for all references to be dropped so we can ++ * release the group object. ++ */ ++ bool destroyed; ++ ++ /** ++ * @timedout: True when a timeout occurred on any of the queues owned by ++ * this group. ++ * ++ * Timeouts can be reported by drm_sched or by the FW. In any case, any ++ * timeout situation is unrecoverable, and the group becomes useless. ++ * We simply wait for all references to be dropped so we can release the ++ * group object. ++ */ ++ bool timedout; ++ ++ /** ++ * @syncobjs: Pool of per-queue synchronization objects. ++ * ++ * One sync object per queue. The position of the sync object is ++ * determined by the queue index. ++ */ ++ struct panthor_kernel_bo *syncobjs; ++ ++ /** @state: Group state. */ ++ enum panthor_group_state state; ++ ++ /** ++ * @suspend_buf: Suspend buffer. ++ * ++ * Stores the state of the group and its queues when a group is suspended. ++ * Used at resume time to restore the group in its previous state. ++ * ++ * The size of the suspend buffer is exposed through the FW interface. ++ */ ++ struct panthor_kernel_bo *suspend_buf; ++ ++ /** ++ * @protm_suspend_buf: Protection mode suspend buffer. ++ * ++ * Stores the state of the group and its queues when a group that's in ++ * protection mode is suspended. ++ * ++ * Used at resume time to restore the group in its previous state. ++ * ++ * The size of the protection mode suspend buffer is exposed through the ++ * FW interface. ++ */ ++ struct panthor_kernel_bo *protm_suspend_buf; ++ ++ /** @sync_upd_work: Work used to check/signal job fences. */ ++ struct work_struct sync_upd_work; ++ ++ /** @tiler_oom_work: Work used to process tiler OOM events happening on this group. */ ++ struct work_struct tiler_oom_work; ++ ++ /** @term_work: Work used to finish the group termination procedure. */ ++ struct work_struct term_work; ++ ++ /** ++ * @release_work: Work used to release group resources. ++ * ++ * We need to postpone the group release to avoid a deadlock when ++ * the last ref is released in the tick work. ++ */ ++ struct work_struct release_work; ++ ++ /** ++ * @run_node: Node used to insert the group in the ++ * panthor_group::groups::{runnable,idle} and ++ * panthor_group::reset.stopped_groups lists. ++ */ ++ struct list_head run_node; ++ ++ /** ++ * @wait_node: Node used to insert the group in the ++ * panthor_group::groups::waiting list. ++ */ ++ struct list_head wait_node; ++}; ++ ++/** ++ * group_queue_work() - Queue a group work ++ * @group: Group to queue the work for. ++ * @wname: Work name. ++ * ++ * Grabs a ref and queue a work item to the scheduler workqueue. If ++ * the work was already queued, we release the reference we grabbed. ++ * ++ * Work callbacks must release the reference we grabbed here. ++ */ ++#define group_queue_work(group, wname) \ ++ do { \ ++ group_get(group); \ ++ if (!queue_work((group)->ptdev->scheduler->wq, &(group)->wname ## _work)) \ ++ group_put(group); \ ++ } while (0) ++ ++/** ++ * sched_queue_work() - Queue a scheduler work. ++ * @sched: Scheduler object. ++ * @wname: Work name. ++ * ++ * Conditionally queues a scheduler work if no reset is pending/in-progress. ++ */ ++#define sched_queue_work(sched, wname) \ ++ do { \ ++ if (!atomic_read(&(sched)->reset.in_progress) && \ ++ !panthor_device_reset_is_pending((sched)->ptdev)) \ ++ queue_work((sched)->wq, &(sched)->wname ## _work); \ ++ } while (0) ++ ++/** ++ * sched_queue_delayed_work() - Queue a scheduler delayed work. ++ * @sched: Scheduler object. ++ * @wname: Work name. ++ * @delay: Work delay in jiffies. ++ * ++ * Conditionally queues a scheduler delayed work if no reset is ++ * pending/in-progress. ++ */ ++#define sched_queue_delayed_work(sched, wname, delay) \ ++ do { \ ++ if (!atomic_read(&sched->reset.in_progress) && \ ++ !panthor_device_reset_is_pending((sched)->ptdev)) \ ++ mod_delayed_work((sched)->wq, &(sched)->wname ## _work, delay); \ ++ } while (0) ++ ++/* ++ * We currently set the maximum of groups per file to an arbitrary low value. ++ * But this can be updated if we need more. ++ */ ++#define MAX_GROUPS_PER_POOL 128 ++ ++/** ++ * struct panthor_group_pool - Group pool ++ * ++ * Each file get assigned a group pool. ++ */ ++struct panthor_group_pool { ++ /** @xa: Xarray used to manage group handles. */ ++ struct xarray xa; ++}; ++ ++/** ++ * struct panthor_job - Used to manage GPU job ++ */ ++struct panthor_job { ++ /** @base: Inherit from drm_sched_job. */ ++ struct drm_sched_job base; ++ ++ /** @refcount: Reference count. */ ++ struct kref refcount; ++ ++ /** @group: Group of the queue this job will be pushed to. */ ++ struct panthor_group *group; ++ ++ /** @queue_idx: Index of the queue inside @group. */ ++ u32 queue_idx; ++ ++ /** @call_info: Information about the userspace command stream call. */ ++ struct { ++ /** @start: GPU address of the userspace command stream. */ ++ u64 start; ++ ++ /** @size: Size of the userspace command stream. */ ++ u32 size; ++ ++ /** ++ * @latest_flush: Flush ID at the time the userspace command ++ * stream was built. ++ * ++ * Needed for the flush reduction mechanism. ++ */ ++ u32 latest_flush; ++ } call_info; ++ ++ /** @ringbuf: Position of this job is in the ring buffer. */ ++ struct { ++ /** @start: Start offset. */ ++ u64 start; ++ ++ /** @end: End offset. */ ++ u64 end; ++ } ringbuf; ++ ++ /** ++ * @node: Used to insert the job in the panthor_queue::fence_ctx::in_flight_jobs ++ * list. ++ */ ++ struct list_head node; ++ ++ /** @done_fence: Fence signaled when the job is finished or cancelled. */ ++ struct dma_fence *done_fence; ++}; ++ ++static void ++panthor_queue_put_syncwait_obj(struct panthor_queue *queue) ++{ ++ if (queue->syncwait.kmap) { ++ struct iosys_map map = IOSYS_MAP_INIT_VADDR(queue->syncwait.kmap); ++ ++ drm_gem_vunmap_unlocked(queue->syncwait.obj, &map); ++ queue->syncwait.kmap = NULL; ++ } ++ ++ drm_gem_object_put(queue->syncwait.obj); ++ queue->syncwait.obj = NULL; ++} ++ ++static void * ++panthor_queue_get_syncwait_obj(struct panthor_group *group, struct panthor_queue *queue) ++{ ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_gem_object *bo; ++ struct iosys_map map; ++ int ret; ++ ++ if (queue->syncwait.kmap) ++ return queue->syncwait.kmap + queue->syncwait.offset; ++ ++ bo = panthor_vm_get_bo_for_va(group->vm, ++ queue->syncwait.gpu_va, ++ &queue->syncwait.offset); ++ if (drm_WARN_ON(&ptdev->base, IS_ERR_OR_NULL(bo))) ++ goto err_put_syncwait_obj; ++ ++ queue->syncwait.obj = &bo->base.base; ++ ret = drm_gem_vmap_unlocked(queue->syncwait.obj, &map); ++ if (drm_WARN_ON(&ptdev->base, ret)) ++ goto err_put_syncwait_obj; ++ ++ queue->syncwait.kmap = map.vaddr; ++ if (drm_WARN_ON(&ptdev->base, !queue->syncwait.kmap)) ++ goto err_put_syncwait_obj; ++ ++ return queue->syncwait.kmap + queue->syncwait.offset; ++ ++err_put_syncwait_obj: ++ panthor_queue_put_syncwait_obj(queue); ++ return NULL; ++} ++ ++static void group_free_queue(struct panthor_group *group, struct panthor_queue *queue) ++{ ++ if (IS_ERR_OR_NULL(queue)) ++ return; ++ ++ if (queue->entity.fence_context) ++ drm_sched_entity_destroy(&queue->entity); ++ ++ if (queue->scheduler.ops) ++ drm_sched_fini(&queue->scheduler); ++ ++ panthor_queue_put_syncwait_obj(queue); ++ ++ panthor_kernel_bo_destroy(group->vm, queue->ringbuf); ++ panthor_kernel_bo_destroy(panthor_fw_vm(group->ptdev), queue->iface.mem); ++ ++ kfree(queue); ++} ++ ++static void group_release_work(struct work_struct *work) ++{ ++ struct panthor_group *group = container_of(work, ++ struct panthor_group, ++ release_work); ++ struct panthor_device *ptdev = group->ptdev; ++ u32 i; ++ ++ for (i = 0; i < group->queue_count; i++) ++ group_free_queue(group, group->queues[i]); ++ ++ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->suspend_buf); ++ panthor_kernel_bo_destroy(panthor_fw_vm(ptdev), group->protm_suspend_buf); ++ panthor_kernel_bo_destroy(group->vm, group->syncobjs); ++ ++ panthor_vm_put(group->vm); ++ kfree(group); ++} ++ ++static void group_release(struct kref *kref) ++{ ++ struct panthor_group *group = container_of(kref, ++ struct panthor_group, ++ refcount); ++ struct panthor_device *ptdev = group->ptdev; ++ ++ drm_WARN_ON(&ptdev->base, group->csg_id >= 0); ++ drm_WARN_ON(&ptdev->base, !list_empty(&group->run_node)); ++ drm_WARN_ON(&ptdev->base, !list_empty(&group->wait_node)); ++ ++ queue_work(panthor_cleanup_wq, &group->release_work); ++} ++ ++static void group_put(struct panthor_group *group) ++{ ++ if (group) ++ kref_put(&group->refcount, group_release); ++} ++ ++static struct panthor_group * ++group_get(struct panthor_group *group) ++{ ++ if (group) ++ kref_get(&group->refcount); ++ ++ return group; ++} ++ ++/** ++ * group_bind_locked() - Bind a group to a group slot ++ * @group: Group. ++ * @csg_id: Slot. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++group_bind_locked(struct panthor_group *group, u32 csg_id) ++{ ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_csg_slot *csg_slot; ++ int ret; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ if (drm_WARN_ON(&ptdev->base, group->csg_id != -1 || csg_id >= MAX_CSGS || ++ ptdev->scheduler->csg_slots[csg_id].group)) ++ return -EINVAL; ++ ++ ret = panthor_vm_active(group->vm); ++ if (ret) ++ return ret; ++ ++ csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ group_get(group); ++ group->csg_id = csg_id; ++ ++ /* Dummy doorbell allocation: doorbell is assigned to the group and ++ * all queues use the same doorbell. ++ * ++ * TODO: Implement LRU-based doorbell assignment, so the most often ++ * updated queues get their own doorbell, thus avoiding useless checks ++ * on queues belonging to the same group that are rarely updated. ++ */ ++ for (u32 i = 0; i < group->queue_count; i++) ++ group->queues[i]->doorbell_id = csg_id + 1; ++ ++ csg_slot->group = group; ++ ++ return 0; ++} ++ ++/** ++ * group_unbind_locked() - Unbind a group from a slot. ++ * @group: Group to unbind. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++group_unbind_locked(struct panthor_group *group) ++{ ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_csg_slot *slot; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ if (drm_WARN_ON(&ptdev->base, group->csg_id < 0 || group->csg_id >= MAX_CSGS)) ++ return -EINVAL; ++ ++ if (drm_WARN_ON(&ptdev->base, group->state == PANTHOR_CS_GROUP_ACTIVE)) ++ return -EINVAL; ++ ++ slot = &ptdev->scheduler->csg_slots[group->csg_id]; ++ panthor_vm_idle(group->vm); ++ group->csg_id = -1; ++ ++ /* Tiler OOM events will be re-issued next time the group is scheduled. */ ++ atomic_set(&group->tiler_oom, 0); ++ cancel_work(&group->tiler_oom_work); ++ ++ for (u32 i = 0; i < group->queue_count; i++) ++ group->queues[i]->doorbell_id = -1; ++ ++ slot->group = NULL; ++ ++ group_put(group); ++ return 0; ++} ++ ++/** ++ * cs_slot_prog_locked() - Program a queue slot ++ * @ptdev: Device. ++ * @csg_id: Group slot ID. ++ * @cs_id: Queue slot ID. ++ * ++ * Program a queue slot with the queue information so things can start being ++ * executed on this queue. ++ * ++ * The group slot must have a group bound to it already (group_bind_locked()). ++ */ ++static void ++cs_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id) ++{ ++ struct panthor_queue *queue = ptdev->scheduler->csg_slots[csg_id].group->queues[cs_id]; ++ struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ queue->iface.input->extract = queue->iface.output->extract; ++ drm_WARN_ON(&ptdev->base, queue->iface.input->insert < queue->iface.input->extract); ++ ++ cs_iface->input->ringbuf_base = panthor_kernel_bo_gpuva(queue->ringbuf); ++ cs_iface->input->ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); ++ cs_iface->input->ringbuf_input = queue->iface.input_fw_va; ++ cs_iface->input->ringbuf_output = queue->iface.output_fw_va; ++ cs_iface->input->config = CS_CONFIG_PRIORITY(queue->priority) | ++ CS_CONFIG_DOORBELL(queue->doorbell_id); ++ cs_iface->input->ack_irq_mask = ~0; ++ panthor_fw_update_reqs(cs_iface, req, ++ CS_IDLE_SYNC_WAIT | ++ CS_IDLE_EMPTY | ++ CS_STATE_START | ++ CS_EXTRACT_EVENT, ++ CS_IDLE_SYNC_WAIT | ++ CS_IDLE_EMPTY | ++ CS_STATE_MASK | ++ CS_EXTRACT_EVENT); ++ if (queue->iface.input->insert != queue->iface.input->extract && queue->timeout_suspended) { ++ drm_sched_resume_timeout(&queue->scheduler, queue->remaining_time); ++ queue->timeout_suspended = false; ++ } ++} ++ ++/** ++ * @cs_slot_reset_locked() - Reset a queue slot ++ * @ptdev: Device. ++ * @csg_id: Group slot. ++ * @cs_id: Queue slot. ++ * ++ * Change the queue slot state to STOP and suspend the queue timeout if ++ * the queue is not blocked. ++ * ++ * The group slot must have a group bound to it (group_bind_locked()). ++ */ ++static int ++cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id) ++{ ++ struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group; ++ struct panthor_queue *queue = group->queues[cs_id]; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ panthor_fw_update_reqs(cs_iface, req, ++ CS_STATE_STOP, ++ CS_STATE_MASK); ++ ++ /* If the queue is blocked, we want to keep the timeout running, so ++ * we can detect unbounded waits and kill the group when that happens. ++ */ ++ if (!(group->blocked_queues & BIT(cs_id)) && !queue->timeout_suspended) { ++ queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler); ++ queue->timeout_suspended = true; ++ WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS)); ++ } ++ ++ return 0; ++} ++ ++/** ++ * csg_slot_sync_priority_locked() - Synchronize the group slot priority ++ * @ptdev: Device. ++ * @csg_id: Group slot ID. ++ * ++ * Group slot priority update happens asynchronously. When we receive a ++ * %CSG_ENDPOINT_CONFIG, we know the update is effective, and can ++ * reflect it to our panthor_csg_slot object. ++ */ ++static void ++csg_slot_sync_priority_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ struct panthor_fw_csg_iface *csg_iface; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ csg_slot->priority = (csg_iface->input->endpoint_req & CSG_EP_REQ_PRIORITY_MASK) >> 28; ++} ++ ++/** ++ * cs_slot_sync_queue_state_locked() - Synchronize the queue slot priority ++ * @ptdev: Device. ++ * @csg_id: Group slot. ++ * @cs_id: Queue slot. ++ * ++ * Queue state is updated on group suspend or STATUS_UPDATE event. ++ */ ++static void ++cs_slot_sync_queue_state_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id) ++{ ++ struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group; ++ struct panthor_queue *queue = group->queues[cs_id]; ++ struct panthor_fw_cs_iface *cs_iface = ++ panthor_fw_get_cs_iface(group->ptdev, csg_id, cs_id); ++ ++ u32 status_wait_cond; ++ ++ switch (cs_iface->output->status_blocked_reason) { ++ case CS_STATUS_BLOCKED_REASON_UNBLOCKED: ++ if (queue->iface.input->insert == queue->iface.output->extract && ++ cs_iface->output->status_scoreboards == 0) ++ group->idle_queues |= BIT(cs_id); ++ break; ++ ++ case CS_STATUS_BLOCKED_REASON_SYNC_WAIT: ++ if (list_empty(&group->wait_node)) { ++ list_move_tail(&group->wait_node, ++ &group->ptdev->scheduler->groups.waiting); ++ } ++ group->blocked_queues |= BIT(cs_id); ++ queue->syncwait.gpu_va = cs_iface->output->status_wait_sync_ptr; ++ queue->syncwait.ref = cs_iface->output->status_wait_sync_value; ++ status_wait_cond = cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_COND_MASK; ++ queue->syncwait.gt = status_wait_cond == CS_STATUS_WAIT_SYNC_COND_GT; ++ if (cs_iface->output->status_wait & CS_STATUS_WAIT_SYNC_64B) { ++ u64 sync_val_hi = cs_iface->output->status_wait_sync_value_hi; ++ ++ queue->syncwait.sync64 = true; ++ queue->syncwait.ref |= sync_val_hi << 32; ++ } else { ++ queue->syncwait.sync64 = false; ++ } ++ break; ++ ++ default: ++ /* Other reasons are not blocking. Consider the queue as runnable ++ * in those cases. ++ */ ++ break; ++ } ++} ++ ++static void ++csg_slot_sync_queues_state_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ struct panthor_group *group = csg_slot->group; ++ u32 i; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ group->idle_queues = 0; ++ group->blocked_queues = 0; ++ ++ for (i = 0; i < group->queue_count; i++) { ++ if (group->queues[i]) ++ cs_slot_sync_queue_state_locked(ptdev, csg_id, i); ++ } ++} ++ ++static void ++csg_slot_sync_state_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ struct panthor_fw_csg_iface *csg_iface; ++ struct panthor_group *group; ++ enum panthor_group_state new_state, old_state; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ group = csg_slot->group; ++ ++ if (!group) ++ return; ++ ++ old_state = group->state; ++ switch (csg_iface->output->ack & CSG_STATE_MASK) { ++ case CSG_STATE_START: ++ case CSG_STATE_RESUME: ++ new_state = PANTHOR_CS_GROUP_ACTIVE; ++ break; ++ case CSG_STATE_TERMINATE: ++ new_state = PANTHOR_CS_GROUP_TERMINATED; ++ break; ++ case CSG_STATE_SUSPEND: ++ new_state = PANTHOR_CS_GROUP_SUSPENDED; ++ break; ++ } ++ ++ if (old_state == new_state) ++ return; ++ ++ if (new_state == PANTHOR_CS_GROUP_SUSPENDED) ++ csg_slot_sync_queues_state_locked(ptdev, csg_id); ++ ++ if (old_state == PANTHOR_CS_GROUP_ACTIVE) { ++ u32 i; ++ ++ /* Reset the queue slots so we start from a clean ++ * state when starting/resuming a new group on this ++ * CSG slot. No wait needed here, and no ringbell ++ * either, since the CS slot will only be re-used ++ * on the next CSG start operation. ++ */ ++ for (i = 0; i < group->queue_count; i++) { ++ if (group->queues[i]) ++ cs_slot_reset_locked(ptdev, csg_id, i); ++ } ++ } ++ ++ group->state = new_state; ++} ++ ++static int ++csg_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 priority) ++{ ++ struct panthor_fw_csg_iface *csg_iface; ++ struct panthor_csg_slot *csg_slot; ++ struct panthor_group *group; ++ u32 queue_mask = 0, i; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ if (priority > MAX_CSG_PRIO) ++ return -EINVAL; ++ ++ if (drm_WARN_ON(&ptdev->base, csg_id >= MAX_CSGS)) ++ return -EINVAL; ++ ++ csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ group = csg_slot->group; ++ if (!group || group->state == PANTHOR_CS_GROUP_ACTIVE) ++ return 0; ++ ++ csg_iface = panthor_fw_get_csg_iface(group->ptdev, csg_id); ++ ++ for (i = 0; i < group->queue_count; i++) { ++ if (group->queues[i]) { ++ cs_slot_prog_locked(ptdev, csg_id, i); ++ queue_mask |= BIT(i); ++ } ++ } ++ ++ csg_iface->input->allow_compute = group->compute_core_mask; ++ csg_iface->input->allow_fragment = group->fragment_core_mask; ++ csg_iface->input->allow_other = group->tiler_core_mask; ++ csg_iface->input->endpoint_req = CSG_EP_REQ_COMPUTE(group->max_compute_cores) | ++ CSG_EP_REQ_FRAGMENT(group->max_fragment_cores) | ++ CSG_EP_REQ_TILER(group->max_tiler_cores) | ++ CSG_EP_REQ_PRIORITY(priority); ++ csg_iface->input->config = panthor_vm_as(group->vm); ++ ++ if (group->suspend_buf) ++ csg_iface->input->suspend_buf = panthor_kernel_bo_gpuva(group->suspend_buf); ++ else ++ csg_iface->input->suspend_buf = 0; ++ ++ if (group->protm_suspend_buf) { ++ csg_iface->input->protm_suspend_buf = ++ panthor_kernel_bo_gpuva(group->protm_suspend_buf); ++ } else { ++ csg_iface->input->protm_suspend_buf = 0; ++ } ++ ++ csg_iface->input->ack_irq_mask = ~0; ++ panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, queue_mask); ++ return 0; ++} ++ ++static void ++cs_slot_process_fatal_event_locked(struct panthor_device *ptdev, ++ u32 csg_id, u32 cs_id) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id]; ++ struct panthor_group *group = csg_slot->group; ++ struct panthor_fw_cs_iface *cs_iface; ++ u32 fatal; ++ u64 info; ++ ++ lockdep_assert_held(&sched->lock); ++ ++ cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ fatal = cs_iface->output->fatal; ++ info = cs_iface->output->fatal_info; ++ ++ if (group) ++ group->fatal_queues |= BIT(cs_id); ++ ++ sched_queue_delayed_work(sched, tick, 0); ++ drm_warn(&ptdev->base, ++ "CSG slot %d CS slot: %d\n" ++ "CS_FATAL.EXCEPTION_TYPE: 0x%x (%s)\n" ++ "CS_FATAL.EXCEPTION_DATA: 0x%x\n" ++ "CS_FATAL_INFO.EXCEPTION_DATA: 0x%llx\n", ++ csg_id, cs_id, ++ (unsigned int)CS_EXCEPTION_TYPE(fatal), ++ panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fatal)), ++ (unsigned int)CS_EXCEPTION_DATA(fatal), ++ info); ++} ++ ++static void ++cs_slot_process_fault_event_locked(struct panthor_device *ptdev, ++ u32 csg_id, u32 cs_id) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id]; ++ struct panthor_group *group = csg_slot->group; ++ struct panthor_queue *queue = group && cs_id < group->queue_count ? ++ group->queues[cs_id] : NULL; ++ struct panthor_fw_cs_iface *cs_iface; ++ u32 fault; ++ u64 info; ++ ++ lockdep_assert_held(&sched->lock); ++ ++ cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ fault = cs_iface->output->fault; ++ info = cs_iface->output->fault_info; ++ ++ if (queue && CS_EXCEPTION_TYPE(fault) == DRM_PANTHOR_EXCEPTION_CS_INHERIT_FAULT) { ++ u64 cs_extract = queue->iface.output->extract; ++ struct panthor_job *job; ++ ++ spin_lock(&queue->fence_ctx.lock); ++ list_for_each_entry(job, &queue->fence_ctx.in_flight_jobs, node) { ++ if (cs_extract >= job->ringbuf.end) ++ continue; ++ ++ if (cs_extract < job->ringbuf.start) ++ break; ++ ++ dma_fence_set_error(job->done_fence, -EINVAL); ++ } ++ spin_unlock(&queue->fence_ctx.lock); ++ } ++ ++ drm_warn(&ptdev->base, ++ "CSG slot %d CS slot: %d\n" ++ "CS_FAULT.EXCEPTION_TYPE: 0x%x (%s)\n" ++ "CS_FAULT.EXCEPTION_DATA: 0x%x\n" ++ "CS_FAULT_INFO.EXCEPTION_DATA: 0x%llx\n", ++ csg_id, cs_id, ++ (unsigned int)CS_EXCEPTION_TYPE(fault), ++ panthor_exception_name(ptdev, CS_EXCEPTION_TYPE(fault)), ++ (unsigned int)CS_EXCEPTION_DATA(fault), ++ info); ++} ++ ++static int group_process_tiler_oom(struct panthor_group *group, u32 cs_id) ++{ ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ u32 renderpasses_in_flight, pending_frag_count; ++ struct panthor_heap_pool *heaps = NULL; ++ u64 heap_address, new_chunk_va = 0; ++ u32 vt_start, vt_end, frag_end; ++ int ret, csg_id; ++ ++ mutex_lock(&sched->lock); ++ csg_id = group->csg_id; ++ if (csg_id >= 0) { ++ struct panthor_fw_cs_iface *cs_iface; ++ ++ cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ heaps = panthor_vm_get_heap_pool(group->vm, false); ++ heap_address = cs_iface->output->heap_address; ++ vt_start = cs_iface->output->heap_vt_start; ++ vt_end = cs_iface->output->heap_vt_end; ++ frag_end = cs_iface->output->heap_frag_end; ++ renderpasses_in_flight = vt_start - frag_end; ++ pending_frag_count = vt_end - frag_end; ++ } ++ mutex_unlock(&sched->lock); ++ ++ /* The group got scheduled out, we stop here. We will get a new tiler OOM event ++ * when it's scheduled again. ++ */ ++ if (unlikely(csg_id < 0)) ++ return 0; ++ ++ if (!heaps || frag_end > vt_end || vt_end >= vt_start) { ++ ret = -EINVAL; ++ } else { ++ /* We do the allocation without holding the scheduler lock to avoid ++ * blocking the scheduling. ++ */ ++ ret = panthor_heap_grow(heaps, heap_address, ++ renderpasses_in_flight, ++ pending_frag_count, &new_chunk_va); ++ } ++ ++ if (ret && ret != -EBUSY) { ++ drm_warn(&ptdev->base, "Failed to extend the tiler heap\n"); ++ group->fatal_queues |= BIT(cs_id); ++ sched_queue_delayed_work(sched, tick, 0); ++ goto out_put_heap_pool; ++ } ++ ++ mutex_lock(&sched->lock); ++ csg_id = group->csg_id; ++ if (csg_id >= 0) { ++ struct panthor_fw_csg_iface *csg_iface; ++ struct panthor_fw_cs_iface *cs_iface; ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ ++ cs_iface->input->heap_start = new_chunk_va; ++ cs_iface->input->heap_end = new_chunk_va; ++ panthor_fw_update_reqs(cs_iface, req, cs_iface->output->ack, CS_TILER_OOM); ++ panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, BIT(cs_id)); ++ panthor_fw_ring_csg_doorbells(ptdev, BIT(csg_id)); ++ } ++ mutex_unlock(&sched->lock); ++ ++ /* We allocated a chunck, but couldn't link it to the heap ++ * context because the group was scheduled out while we were ++ * allocating memory. We need to return this chunk to the heap. ++ */ ++ if (unlikely(csg_id < 0 && new_chunk_va)) ++ panthor_heap_return_chunk(heaps, heap_address, new_chunk_va); ++ ++ ret = 0; ++ ++out_put_heap_pool: ++ panthor_heap_pool_put(heaps); ++ return ret; ++} ++ ++static void group_tiler_oom_work(struct work_struct *work) ++{ ++ struct panthor_group *group = ++ container_of(work, struct panthor_group, tiler_oom_work); ++ u32 tiler_oom = atomic_xchg(&group->tiler_oom, 0); ++ ++ while (tiler_oom) { ++ u32 cs_id = ffs(tiler_oom) - 1; ++ ++ group_process_tiler_oom(group, cs_id); ++ tiler_oom &= ~BIT(cs_id); ++ } ++ ++ group_put(group); ++} ++ ++static void ++cs_slot_process_tiler_oom_event_locked(struct panthor_device *ptdev, ++ u32 csg_id, u32 cs_id) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id]; ++ struct panthor_group *group = csg_slot->group; ++ ++ lockdep_assert_held(&sched->lock); ++ ++ if (drm_WARN_ON(&ptdev->base, !group)) ++ return; ++ ++ atomic_or(BIT(cs_id), &group->tiler_oom); ++ ++ /* We don't use group_queue_work() here because we want to queue the ++ * work item to the heap_alloc_wq. ++ */ ++ group_get(group); ++ if (!queue_work(sched->heap_alloc_wq, &group->tiler_oom_work)) ++ group_put(group); ++} ++ ++static bool cs_slot_process_irq_locked(struct panthor_device *ptdev, ++ u32 csg_id, u32 cs_id) ++{ ++ struct panthor_fw_cs_iface *cs_iface; ++ u32 req, ack, events; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id); ++ req = cs_iface->input->req; ++ ack = cs_iface->output->ack; ++ events = (req ^ ack) & CS_EVT_MASK; ++ ++ if (events & CS_FATAL) ++ cs_slot_process_fatal_event_locked(ptdev, csg_id, cs_id); ++ ++ if (events & CS_FAULT) ++ cs_slot_process_fault_event_locked(ptdev, csg_id, cs_id); ++ ++ if (events & CS_TILER_OOM) ++ cs_slot_process_tiler_oom_event_locked(ptdev, csg_id, cs_id); ++ ++ /* We don't acknowledge the TILER_OOM event since its handling is ++ * deferred to a separate work. ++ */ ++ panthor_fw_update_reqs(cs_iface, req, ack, CS_FATAL | CS_FAULT); ++ ++ return (events & (CS_FAULT | CS_TILER_OOM)) != 0; ++} ++ ++static void csg_slot_sync_idle_state_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ struct panthor_fw_csg_iface *csg_iface; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ csg_slot->idle = csg_iface->output->status_state & CSG_STATUS_STATE_IS_IDLE; ++} ++ ++static void csg_slot_process_idle_event_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ ++ lockdep_assert_held(&sched->lock); ++ ++ sched->might_have_idle_groups = true; ++ ++ /* Schedule a tick so we can evict idle groups and schedule non-idle ++ * ones. This will also update runtime PM and devfreq busy/idle states, ++ * so the device can lower its frequency or get suspended. ++ */ ++ sched_queue_delayed_work(sched, tick, 0); ++} ++ ++static void csg_slot_sync_update_locked(struct panthor_device *ptdev, ++ u32 csg_id) ++{ ++ struct panthor_csg_slot *csg_slot = &ptdev->scheduler->csg_slots[csg_id]; ++ struct panthor_group *group = csg_slot->group; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ if (group) ++ group_queue_work(group, sync_upd); ++ ++ sched_queue_work(ptdev->scheduler, sync_upd); ++} ++ ++static void ++csg_slot_process_progress_timer_event_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id]; ++ struct panthor_group *group = csg_slot->group; ++ ++ lockdep_assert_held(&sched->lock); ++ ++ drm_warn(&ptdev->base, "CSG slot %d progress timeout\n", csg_id); ++ ++ group = csg_slot->group; ++ if (!drm_WARN_ON(&ptdev->base, !group)) ++ group->timedout = true; ++ ++ sched_queue_delayed_work(sched, tick, 0); ++} ++ ++static void sched_process_csg_irq_locked(struct panthor_device *ptdev, u32 csg_id) ++{ ++ u32 req, ack, cs_irq_req, cs_irq_ack, cs_irqs, csg_events; ++ struct panthor_fw_csg_iface *csg_iface; ++ u32 ring_cs_db_mask = 0; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ if (drm_WARN_ON(&ptdev->base, csg_id >= ptdev->scheduler->csg_slot_count)) ++ return; ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ req = READ_ONCE(csg_iface->input->req); ++ ack = READ_ONCE(csg_iface->output->ack); ++ cs_irq_req = READ_ONCE(csg_iface->output->cs_irq_req); ++ cs_irq_ack = READ_ONCE(csg_iface->input->cs_irq_ack); ++ csg_events = (req ^ ack) & CSG_EVT_MASK; ++ ++ /* There may not be any pending CSG/CS interrupts to process */ ++ if (req == ack && cs_irq_req == cs_irq_ack) ++ return; ++ ++ /* Immediately set IRQ_ACK bits to be same as the IRQ_REQ bits before ++ * examining the CS_ACK & CS_REQ bits. This would ensure that Host ++ * doesn't miss an interrupt for the CS in the race scenario where ++ * whilst Host is servicing an interrupt for the CS, firmware sends ++ * another interrupt for that CS. ++ */ ++ csg_iface->input->cs_irq_ack = cs_irq_req; ++ ++ panthor_fw_update_reqs(csg_iface, req, ack, ++ CSG_SYNC_UPDATE | ++ CSG_IDLE | ++ CSG_PROGRESS_TIMER_EVENT); ++ ++ if (csg_events & CSG_IDLE) ++ csg_slot_process_idle_event_locked(ptdev, csg_id); ++ ++ if (csg_events & CSG_PROGRESS_TIMER_EVENT) ++ csg_slot_process_progress_timer_event_locked(ptdev, csg_id); ++ ++ cs_irqs = cs_irq_req ^ cs_irq_ack; ++ while (cs_irqs) { ++ u32 cs_id = ffs(cs_irqs) - 1; ++ ++ if (cs_slot_process_irq_locked(ptdev, csg_id, cs_id)) ++ ring_cs_db_mask |= BIT(cs_id); ++ ++ cs_irqs &= ~BIT(cs_id); ++ } ++ ++ if (csg_events & CSG_SYNC_UPDATE) ++ csg_slot_sync_update_locked(ptdev, csg_id); ++ ++ if (ring_cs_db_mask) ++ panthor_fw_toggle_reqs(csg_iface, doorbell_req, doorbell_ack, ring_cs_db_mask); ++ ++ panthor_fw_ring_csg_doorbells(ptdev, BIT(csg_id)); ++} ++ ++static void sched_process_idle_event_locked(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ /* Acknowledge the idle event and schedule a tick. */ ++ panthor_fw_update_reqs(glb_iface, req, glb_iface->output->ack, GLB_IDLE); ++ sched_queue_delayed_work(ptdev->scheduler, tick, 0); ++} ++ ++/** ++ * panthor_sched_process_global_irq() - Process the scheduling part of a global IRQ ++ * @ptdev: Device. ++ */ ++static void sched_process_global_irq_locked(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ u32 req, ack, evts; ++ ++ lockdep_assert_held(&ptdev->scheduler->lock); ++ ++ req = READ_ONCE(glb_iface->input->req); ++ ack = READ_ONCE(glb_iface->output->ack); ++ evts = (req ^ ack) & GLB_EVT_MASK; ++ ++ if (evts & GLB_IDLE) ++ sched_process_idle_event_locked(ptdev); ++} ++ ++static void process_fw_events_work(struct work_struct *work) ++{ ++ struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler, ++ fw_events_work); ++ u32 events = atomic_xchg(&sched->fw_events, 0); ++ struct panthor_device *ptdev = sched->ptdev; ++ ++ mutex_lock(&sched->lock); ++ ++ if (events & JOB_INT_GLOBAL_IF) { ++ sched_process_global_irq_locked(ptdev); ++ events &= ~JOB_INT_GLOBAL_IF; ++ } ++ ++ while (events) { ++ u32 csg_id = ffs(events) - 1; ++ ++ sched_process_csg_irq_locked(ptdev, csg_id); ++ events &= ~BIT(csg_id); ++ } ++ ++ mutex_unlock(&sched->lock); ++} ++ ++/** ++ * panthor_sched_report_fw_events() - Report FW events to the scheduler. ++ */ ++void panthor_sched_report_fw_events(struct panthor_device *ptdev, u32 events) ++{ ++ if (!ptdev->scheduler) ++ return; ++ ++ atomic_or(events, &ptdev->scheduler->fw_events); ++ sched_queue_work(ptdev->scheduler, fw_events); ++} ++ ++static const char *fence_get_driver_name(struct dma_fence *fence) ++{ ++ return "panthor"; ++} ++ ++static const char *queue_fence_get_timeline_name(struct dma_fence *fence) ++{ ++ return "queue-fence"; ++} ++ ++static const struct dma_fence_ops panthor_queue_fence_ops = { ++ .get_driver_name = fence_get_driver_name, ++ .get_timeline_name = queue_fence_get_timeline_name, ++}; ++ ++/** ++ */ ++struct panthor_csg_slots_upd_ctx { ++ u32 update_mask; ++ u32 timedout_mask; ++ struct { ++ u32 value; ++ u32 mask; ++ } requests[MAX_CSGS]; ++}; ++ ++static void csgs_upd_ctx_init(struct panthor_csg_slots_upd_ctx *ctx) ++{ ++ memset(ctx, 0, sizeof(*ctx)); ++} ++ ++static void csgs_upd_ctx_queue_reqs(struct panthor_device *ptdev, ++ struct panthor_csg_slots_upd_ctx *ctx, ++ u32 csg_id, u32 value, u32 mask) ++{ ++ if (drm_WARN_ON(&ptdev->base, !mask) || ++ drm_WARN_ON(&ptdev->base, csg_id >= ptdev->scheduler->csg_slot_count)) ++ return; ++ ++ ctx->requests[csg_id].value = (ctx->requests[csg_id].value & ~mask) | (value & mask); ++ ctx->requests[csg_id].mask |= mask; ++ ctx->update_mask |= BIT(csg_id); ++} ++ ++static int csgs_upd_ctx_apply_locked(struct panthor_device *ptdev, ++ struct panthor_csg_slots_upd_ctx *ctx) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ u32 update_slots = ctx->update_mask; ++ ++ lockdep_assert_held(&sched->lock); ++ ++ if (!ctx->update_mask) ++ return 0; ++ ++ while (update_slots) { ++ struct panthor_fw_csg_iface *csg_iface; ++ u32 csg_id = ffs(update_slots) - 1; ++ ++ update_slots &= ~BIT(csg_id); ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ panthor_fw_update_reqs(csg_iface, req, ++ ctx->requests[csg_id].value, ++ ctx->requests[csg_id].mask); ++ } ++ ++ panthor_fw_ring_csg_doorbells(ptdev, ctx->update_mask); ++ ++ update_slots = ctx->update_mask; ++ while (update_slots) { ++ struct panthor_fw_csg_iface *csg_iface; ++ u32 csg_id = ffs(update_slots) - 1; ++ u32 req_mask = ctx->requests[csg_id].mask, acked; ++ int ret; ++ ++ update_slots &= ~BIT(csg_id); ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ ++ ret = panthor_fw_csg_wait_acks(ptdev, csg_id, req_mask, &acked, 100); ++ ++ if (acked & CSG_ENDPOINT_CONFIG) ++ csg_slot_sync_priority_locked(ptdev, csg_id); ++ ++ if (acked & CSG_STATE_MASK) ++ csg_slot_sync_state_locked(ptdev, csg_id); ++ ++ if (acked & CSG_STATUS_UPDATE) { ++ csg_slot_sync_queues_state_locked(ptdev, csg_id); ++ csg_slot_sync_idle_state_locked(ptdev, csg_id); ++ } ++ ++ if (ret && acked != req_mask && ++ ((csg_iface->input->req ^ csg_iface->output->ack) & req_mask) != 0) { ++ drm_err(&ptdev->base, "CSG %d update request timedout", csg_id); ++ ctx->timedout_mask |= BIT(csg_id); ++ } ++ } ++ ++ if (ctx->timedout_mask) ++ return -ETIMEDOUT; ++ ++ return 0; ++} ++ ++struct panthor_sched_tick_ctx { ++ struct list_head old_groups[PANTHOR_CSG_PRIORITY_COUNT]; ++ struct list_head groups[PANTHOR_CSG_PRIORITY_COUNT]; ++ u32 idle_group_count; ++ u32 group_count; ++ enum panthor_csg_priority min_priority; ++ struct panthor_vm *vms[MAX_CS_PER_CSG]; ++ u32 as_count; ++ bool immediate_tick; ++ u32 csg_upd_failed_mask; ++}; ++ ++static bool ++tick_ctx_is_full(const struct panthor_scheduler *sched, ++ const struct panthor_sched_tick_ctx *ctx) ++{ ++ return ctx->group_count == sched->csg_slot_count; ++} ++ ++static bool ++group_is_idle(struct panthor_group *group) ++{ ++ struct panthor_device *ptdev = group->ptdev; ++ u32 inactive_queues; ++ ++ if (group->csg_id >= 0) ++ return ptdev->scheduler->csg_slots[group->csg_id].idle; ++ ++ inactive_queues = group->idle_queues | group->blocked_queues; ++ return hweight32(inactive_queues) == group->queue_count; ++} ++ ++static bool ++group_can_run(struct panthor_group *group) ++{ ++ return group->state != PANTHOR_CS_GROUP_TERMINATED && ++ !group->destroyed && group->fatal_queues == 0 && ++ !group->timedout; ++} ++ ++static void ++tick_ctx_pick_groups_from_list(const struct panthor_scheduler *sched, ++ struct panthor_sched_tick_ctx *ctx, ++ struct list_head *queue, ++ bool skip_idle_groups, ++ bool owned_by_tick_ctx) ++{ ++ struct panthor_group *group, *tmp; ++ ++ if (tick_ctx_is_full(sched, ctx)) ++ return; ++ ++ list_for_each_entry_safe(group, tmp, queue, run_node) { ++ u32 i; ++ ++ if (!group_can_run(group)) ++ continue; ++ ++ if (skip_idle_groups && group_is_idle(group)) ++ continue; ++ ++ for (i = 0; i < ctx->as_count; i++) { ++ if (ctx->vms[i] == group->vm) ++ break; ++ } ++ ++ if (i == ctx->as_count && ctx->as_count == sched->as_slot_count) ++ continue; ++ ++ if (!owned_by_tick_ctx) ++ group_get(group); ++ ++ list_move_tail(&group->run_node, &ctx->groups[group->priority]); ++ ctx->group_count++; ++ if (group_is_idle(group)) ++ ctx->idle_group_count++; ++ ++ if (i == ctx->as_count) ++ ctx->vms[ctx->as_count++] = group->vm; ++ ++ if (ctx->min_priority > group->priority) ++ ctx->min_priority = group->priority; ++ ++ if (tick_ctx_is_full(sched, ctx)) ++ return; ++ } ++} ++ ++static void ++tick_ctx_insert_old_group(struct panthor_scheduler *sched, ++ struct panthor_sched_tick_ctx *ctx, ++ struct panthor_group *group, ++ bool full_tick) ++{ ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[group->csg_id]; ++ struct panthor_group *other_group; ++ ++ if (!full_tick) { ++ list_add_tail(&group->run_node, &ctx->old_groups[group->priority]); ++ return; ++ } ++ ++ /* Rotate to make sure groups with lower CSG slot ++ * priorities have a chance to get a higher CSG slot ++ * priority next time they get picked. This priority ++ * has an impact on resource request ordering, so it's ++ * important to make sure we don't let one group starve ++ * all other groups with the same group priority. ++ */ ++ list_for_each_entry(other_group, ++ &ctx->old_groups[csg_slot->group->priority], ++ run_node) { ++ struct panthor_csg_slot *other_csg_slot = &sched->csg_slots[other_group->csg_id]; ++ ++ if (other_csg_slot->priority > csg_slot->priority) { ++ list_add_tail(&csg_slot->group->run_node, &other_group->run_node); ++ return; ++ } ++ } ++ ++ list_add_tail(&group->run_node, &ctx->old_groups[group->priority]); ++} ++ ++static void ++tick_ctx_init(struct panthor_scheduler *sched, ++ struct panthor_sched_tick_ctx *ctx, ++ bool full_tick) ++{ ++ struct panthor_device *ptdev = sched->ptdev; ++ struct panthor_csg_slots_upd_ctx upd_ctx; ++ int ret; ++ u32 i; ++ ++ memset(ctx, 0, sizeof(*ctx)); ++ csgs_upd_ctx_init(&upd_ctx); ++ ++ ctx->min_priority = PANTHOR_CSG_PRIORITY_COUNT; ++ for (i = 0; i < ARRAY_SIZE(ctx->groups); i++) { ++ INIT_LIST_HEAD(&ctx->groups[i]); ++ INIT_LIST_HEAD(&ctx->old_groups[i]); ++ } ++ ++ for (i = 0; i < sched->csg_slot_count; i++) { ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[i]; ++ struct panthor_group *group = csg_slot->group; ++ struct panthor_fw_csg_iface *csg_iface; ++ ++ if (!group) ++ continue; ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, i); ++ group_get(group); ++ ++ /* If there was unhandled faults on the VM, force processing of ++ * CSG IRQs, so we can flag the faulty queue. ++ */ ++ if (panthor_vm_has_unhandled_faults(group->vm)) { ++ sched_process_csg_irq_locked(ptdev, i); ++ ++ /* No fatal fault reported, flag all queues as faulty. */ ++ if (!group->fatal_queues) ++ group->fatal_queues |= GENMASK(group->queue_count - 1, 0); ++ } ++ ++ tick_ctx_insert_old_group(sched, ctx, group, full_tick); ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i, ++ csg_iface->output->ack ^ CSG_STATUS_UPDATE, ++ CSG_STATUS_UPDATE); ++ } ++ ++ ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx); ++ if (ret) { ++ panthor_device_schedule_reset(ptdev); ++ ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask; ++ } ++} ++ ++#define NUM_INSTRS_PER_SLOT 16 ++ ++static void ++group_term_post_processing(struct panthor_group *group) ++{ ++ struct panthor_job *job, *tmp; ++ LIST_HEAD(faulty_jobs); ++ bool cookie; ++ u32 i = 0; ++ ++ if (drm_WARN_ON(&group->ptdev->base, group_can_run(group))) ++ return; ++ ++ cookie = dma_fence_begin_signalling(); ++ for (i = 0; i < group->queue_count; i++) { ++ struct panthor_queue *queue = group->queues[i]; ++ struct panthor_syncobj_64b *syncobj; ++ int err; ++ ++ if (group->fatal_queues & BIT(i)) ++ err = -EINVAL; ++ else if (group->timedout) ++ err = -ETIMEDOUT; ++ else ++ err = -ECANCELED; ++ ++ if (!queue) ++ continue; ++ ++ spin_lock(&queue->fence_ctx.lock); ++ list_for_each_entry_safe(job, tmp, &queue->fence_ctx.in_flight_jobs, node) { ++ list_move_tail(&job->node, &faulty_jobs); ++ dma_fence_set_error(job->done_fence, err); ++ dma_fence_signal_locked(job->done_fence); ++ } ++ spin_unlock(&queue->fence_ctx.lock); ++ ++ /* Manually update the syncobj seqno to unblock waiters. */ ++ syncobj = group->syncobjs->kmap + (i * sizeof(*syncobj)); ++ syncobj->status = ~0; ++ syncobj->seqno = atomic64_read(&queue->fence_ctx.seqno); ++ sched_queue_work(group->ptdev->scheduler, sync_upd); ++ } ++ dma_fence_end_signalling(cookie); ++ ++ list_for_each_entry_safe(job, tmp, &faulty_jobs, node) { ++ list_del_init(&job->node); ++ panthor_job_put(&job->base); ++ } ++} ++ ++static void group_term_work(struct work_struct *work) ++{ ++ struct panthor_group *group = ++ container_of(work, struct panthor_group, term_work); ++ ++ group_term_post_processing(group); ++ group_put(group); ++} ++ ++static void ++tick_ctx_cleanup(struct panthor_scheduler *sched, ++ struct panthor_sched_tick_ctx *ctx) ++{ ++ struct panthor_group *group, *tmp; ++ u32 i; ++ ++ for (i = 0; i < ARRAY_SIZE(ctx->old_groups); i++) { ++ list_for_each_entry_safe(group, tmp, &ctx->old_groups[i], run_node) { ++ /* If everything went fine, we should only have groups ++ * to be terminated in the old_groups lists. ++ */ ++ drm_WARN_ON(&group->ptdev->base, !ctx->csg_upd_failed_mask && ++ group_can_run(group)); ++ ++ if (!group_can_run(group)) { ++ list_del_init(&group->run_node); ++ list_del_init(&group->wait_node); ++ group_queue_work(group, term); ++ } else if (group->csg_id >= 0) { ++ list_del_init(&group->run_node); ++ } else { ++ list_move(&group->run_node, ++ group_is_idle(group) ? ++ &sched->groups.idle[group->priority] : ++ &sched->groups.runnable[group->priority]); ++ } ++ group_put(group); ++ } ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(ctx->groups); i++) { ++ /* If everything went fine, the groups to schedule lists should ++ * be empty. ++ */ ++ drm_WARN_ON(&group->ptdev->base, ++ !ctx->csg_upd_failed_mask && !list_empty(&ctx->groups[i])); ++ ++ list_for_each_entry_safe(group, tmp, &ctx->groups[i], run_node) { ++ if (group->csg_id >= 0) { ++ list_del_init(&group->run_node); ++ } else { ++ list_move(&group->run_node, ++ group_is_idle(group) ? ++ &sched->groups.idle[group->priority] : ++ &sched->groups.runnable[group->priority]); ++ } ++ group_put(group); ++ } ++ } ++} ++ ++static void ++tick_ctx_apply(struct panthor_scheduler *sched, struct panthor_sched_tick_ctx *ctx) ++{ ++ struct panthor_group *group, *tmp; ++ struct panthor_device *ptdev = sched->ptdev; ++ struct panthor_csg_slot *csg_slot; ++ int prio, new_csg_prio = MAX_CSG_PRIO, i; ++ u32 csg_mod_mask = 0, free_csg_slots = 0; ++ struct panthor_csg_slots_upd_ctx upd_ctx; ++ int ret; ++ ++ csgs_upd_ctx_init(&upd_ctx); ++ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) { ++ /* Suspend or terminate evicted groups. */ ++ list_for_each_entry(group, &ctx->old_groups[prio], run_node) { ++ bool term = !group_can_run(group); ++ int csg_id = group->csg_id; ++ ++ if (drm_WARN_ON(&ptdev->base, csg_id < 0)) ++ continue; ++ ++ csg_slot = &sched->csg_slots[csg_id]; ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id, ++ term ? CSG_STATE_TERMINATE : CSG_STATE_SUSPEND, ++ CSG_STATE_MASK); ++ } ++ ++ /* Update priorities on already running groups. */ ++ list_for_each_entry(group, &ctx->groups[prio], run_node) { ++ struct panthor_fw_csg_iface *csg_iface; ++ int csg_id = group->csg_id; ++ ++ if (csg_id < 0) { ++ new_csg_prio--; ++ continue; ++ } ++ ++ csg_slot = &sched->csg_slots[csg_id]; ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ if (csg_slot->priority == new_csg_prio) { ++ new_csg_prio--; ++ continue; ++ } ++ ++ panthor_fw_update_reqs(csg_iface, endpoint_req, ++ CSG_EP_REQ_PRIORITY(new_csg_prio), ++ CSG_EP_REQ_PRIORITY_MASK); ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id, ++ csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG, ++ CSG_ENDPOINT_CONFIG); ++ new_csg_prio--; ++ } ++ } ++ ++ ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx); ++ if (ret) { ++ panthor_device_schedule_reset(ptdev); ++ ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask; ++ return; ++ } ++ ++ /* Unbind evicted groups. */ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) { ++ list_for_each_entry(group, &ctx->old_groups[prio], run_node) { ++ /* This group is gone. Process interrupts to clear ++ * any pending interrupts before we start the new ++ * group. ++ */ ++ if (group->csg_id >= 0) ++ sched_process_csg_irq_locked(ptdev, group->csg_id); ++ ++ group_unbind_locked(group); ++ } ++ } ++ ++ for (i = 0; i < sched->csg_slot_count; i++) { ++ if (!sched->csg_slots[i].group) ++ free_csg_slots |= BIT(i); ++ } ++ ++ csgs_upd_ctx_init(&upd_ctx); ++ new_csg_prio = MAX_CSG_PRIO; ++ ++ /* Start new groups. */ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) { ++ list_for_each_entry(group, &ctx->groups[prio], run_node) { ++ int csg_id = group->csg_id; ++ struct panthor_fw_csg_iface *csg_iface; ++ ++ if (csg_id >= 0) { ++ new_csg_prio--; ++ continue; ++ } ++ ++ csg_id = ffs(free_csg_slots) - 1; ++ if (drm_WARN_ON(&ptdev->base, csg_id < 0)) ++ break; ++ ++ csg_iface = panthor_fw_get_csg_iface(ptdev, csg_id); ++ csg_slot = &sched->csg_slots[csg_id]; ++ csg_mod_mask |= BIT(csg_id); ++ group_bind_locked(group, csg_id); ++ csg_slot_prog_locked(ptdev, csg_id, new_csg_prio--); ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id, ++ group->state == PANTHOR_CS_GROUP_SUSPENDED ? ++ CSG_STATE_RESUME : CSG_STATE_START, ++ CSG_STATE_MASK); ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id, ++ csg_iface->output->ack ^ CSG_ENDPOINT_CONFIG, ++ CSG_ENDPOINT_CONFIG); ++ free_csg_slots &= ~BIT(csg_id); ++ } ++ } ++ ++ ret = csgs_upd_ctx_apply_locked(ptdev, &upd_ctx); ++ if (ret) { ++ panthor_device_schedule_reset(ptdev); ++ ctx->csg_upd_failed_mask |= upd_ctx.timedout_mask; ++ return; ++ } ++ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) { ++ list_for_each_entry_safe(group, tmp, &ctx->groups[prio], run_node) { ++ list_del_init(&group->run_node); ++ ++ /* If the group has been destroyed while we were ++ * scheduling, ask for an immediate tick to ++ * re-evaluate as soon as possible and get rid of ++ * this dangling group. ++ */ ++ if (group->destroyed) ++ ctx->immediate_tick = true; ++ group_put(group); ++ } ++ ++ /* Return evicted groups to the idle or run queues. Groups ++ * that can no longer be run (because they've been destroyed ++ * or experienced an unrecoverable error) will be scheduled ++ * for destruction in tick_ctx_cleanup(). ++ */ ++ list_for_each_entry_safe(group, tmp, &ctx->old_groups[prio], run_node) { ++ if (!group_can_run(group)) ++ continue; ++ ++ if (group_is_idle(group)) ++ list_move_tail(&group->run_node, &sched->groups.idle[prio]); ++ else ++ list_move_tail(&group->run_node, &sched->groups.runnable[prio]); ++ group_put(group); ++ } ++ } ++ ++ sched->used_csg_slot_count = ctx->group_count; ++ sched->might_have_idle_groups = ctx->idle_group_count > 0; ++} ++ ++static u64 ++tick_ctx_update_resched_target(struct panthor_scheduler *sched, ++ const struct panthor_sched_tick_ctx *ctx) ++{ ++ /* We had space left, no need to reschedule until some external event happens. */ ++ if (!tick_ctx_is_full(sched, ctx)) ++ goto no_tick; ++ ++ /* If idle groups were scheduled, no need to wake up until some external ++ * event happens (group unblocked, new job submitted, ...). ++ */ ++ if (ctx->idle_group_count) ++ goto no_tick; ++ ++ if (drm_WARN_ON(&sched->ptdev->base, ctx->min_priority >= PANTHOR_CSG_PRIORITY_COUNT)) ++ goto no_tick; ++ ++ /* If there are groups of the same priority waiting, we need to ++ * keep the scheduler ticking, otherwise, we'll just wait for ++ * new groups with higher priority to be queued. ++ */ ++ if (!list_empty(&sched->groups.runnable[ctx->min_priority])) { ++ u64 resched_target = sched->last_tick + sched->tick_period; ++ ++ if (time_before64(sched->resched_target, sched->last_tick) || ++ time_before64(resched_target, sched->resched_target)) ++ sched->resched_target = resched_target; ++ ++ return sched->resched_target - sched->last_tick; ++ } ++ ++no_tick: ++ sched->resched_target = U64_MAX; ++ return U64_MAX; ++} ++ ++static void tick_work(struct work_struct *work) ++{ ++ struct panthor_scheduler *sched = container_of(work, struct panthor_scheduler, ++ tick_work.work); ++ struct panthor_device *ptdev = sched->ptdev; ++ struct panthor_sched_tick_ctx ctx; ++ u64 remaining_jiffies = 0, resched_delay; ++ u64 now = get_jiffies_64(); ++ int prio, ret, cookie; ++ ++ if (!drm_dev_enter(&ptdev->base, &cookie)) ++ return; ++ ++ ret = pm_runtime_resume_and_get(ptdev->base.dev); ++ if (drm_WARN_ON(&ptdev->base, ret)) ++ goto out_dev_exit; ++ ++ if (time_before64(now, sched->resched_target)) ++ remaining_jiffies = sched->resched_target - now; ++ ++ mutex_lock(&sched->lock); ++ if (panthor_device_reset_is_pending(sched->ptdev)) ++ goto out_unlock; ++ ++ tick_ctx_init(sched, &ctx, remaining_jiffies != 0); ++ if (ctx.csg_upd_failed_mask) ++ goto out_cleanup_ctx; ++ ++ if (remaining_jiffies) { ++ /* Scheduling forced in the middle of a tick. Only RT groups ++ * can preempt non-RT ones. Currently running RT groups can't be ++ * preempted. ++ */ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; ++ prio >= 0 && !tick_ctx_is_full(sched, &ctx); ++ prio--) { ++ tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], ++ true, true); ++ if (prio == PANTHOR_CSG_PRIORITY_RT) { ++ tick_ctx_pick_groups_from_list(sched, &ctx, ++ &sched->groups.runnable[prio], ++ true, false); ++ } ++ } ++ } ++ ++ /* First pick non-idle groups */ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; ++ prio >= 0 && !tick_ctx_is_full(sched, &ctx); ++ prio--) { ++ tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.runnable[prio], ++ true, false); ++ tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], true, true); ++ } ++ ++ /* If we have free CSG slots left, pick idle groups */ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; ++ prio >= 0 && !tick_ctx_is_full(sched, &ctx); ++ prio--) { ++ /* Check the old_group queue first to avoid reprogramming the slots */ ++ tick_ctx_pick_groups_from_list(sched, &ctx, &ctx.old_groups[prio], false, true); ++ tick_ctx_pick_groups_from_list(sched, &ctx, &sched->groups.idle[prio], ++ false, false); ++ } ++ ++ tick_ctx_apply(sched, &ctx); ++ if (ctx.csg_upd_failed_mask) ++ goto out_cleanup_ctx; ++ ++ if (ctx.idle_group_count == ctx.group_count) { ++ panthor_devfreq_record_idle(sched->ptdev); ++ if (sched->pm.has_ref) { ++ pm_runtime_put_autosuspend(ptdev->base.dev); ++ sched->pm.has_ref = false; ++ } ++ } else { ++ panthor_devfreq_record_busy(sched->ptdev); ++ if (!sched->pm.has_ref) { ++ pm_runtime_get(ptdev->base.dev); ++ sched->pm.has_ref = true; ++ } ++ } ++ ++ sched->last_tick = now; ++ resched_delay = tick_ctx_update_resched_target(sched, &ctx); ++ if (ctx.immediate_tick) ++ resched_delay = 0; ++ ++ if (resched_delay != U64_MAX) ++ sched_queue_delayed_work(sched, tick, resched_delay); ++ ++out_cleanup_ctx: ++ tick_ctx_cleanup(sched, &ctx); ++ ++out_unlock: ++ mutex_unlock(&sched->lock); ++ pm_runtime_mark_last_busy(ptdev->base.dev); ++ pm_runtime_put_autosuspend(ptdev->base.dev); ++ ++out_dev_exit: ++ drm_dev_exit(cookie); ++} ++ ++static int panthor_queue_eval_syncwait(struct panthor_group *group, u8 queue_idx) ++{ ++ struct panthor_queue *queue = group->queues[queue_idx]; ++ union { ++ struct panthor_syncobj_64b sync64; ++ struct panthor_syncobj_32b sync32; ++ } *syncobj; ++ bool result; ++ u64 value; ++ ++ syncobj = panthor_queue_get_syncwait_obj(group, queue); ++ if (!syncobj) ++ return -EINVAL; ++ ++ value = queue->syncwait.sync64 ? ++ syncobj->sync64.seqno : ++ syncobj->sync32.seqno; ++ ++ if (queue->syncwait.gt) ++ result = value > queue->syncwait.ref; ++ else ++ result = value <= queue->syncwait.ref; ++ ++ if (result) ++ panthor_queue_put_syncwait_obj(queue); ++ ++ return result; ++} ++ ++static void sync_upd_work(struct work_struct *work) ++{ ++ struct panthor_scheduler *sched = container_of(work, ++ struct panthor_scheduler, ++ sync_upd_work); ++ struct panthor_group *group, *tmp; ++ bool immediate_tick = false; ++ ++ mutex_lock(&sched->lock); ++ list_for_each_entry_safe(group, tmp, &sched->groups.waiting, wait_node) { ++ u32 tested_queues = group->blocked_queues; ++ u32 unblocked_queues = 0; ++ ++ while (tested_queues) { ++ u32 cs_id = ffs(tested_queues) - 1; ++ int ret; ++ ++ ret = panthor_queue_eval_syncwait(group, cs_id); ++ drm_WARN_ON(&group->ptdev->base, ret < 0); ++ if (ret) ++ unblocked_queues |= BIT(cs_id); ++ ++ tested_queues &= ~BIT(cs_id); ++ } ++ ++ if (unblocked_queues) { ++ group->blocked_queues &= ~unblocked_queues; ++ ++ if (group->csg_id < 0) { ++ list_move(&group->run_node, ++ &sched->groups.runnable[group->priority]); ++ if (group->priority == PANTHOR_CSG_PRIORITY_RT) ++ immediate_tick = true; ++ } ++ } ++ ++ if (!group->blocked_queues) ++ list_del_init(&group->wait_node); ++ } ++ mutex_unlock(&sched->lock); ++ ++ if (immediate_tick) ++ sched_queue_delayed_work(sched, tick, 0); ++} ++ ++static void group_schedule_locked(struct panthor_group *group, u32 queue_mask) ++{ ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct list_head *queue = &sched->groups.runnable[group->priority]; ++ u64 delay_jiffies = 0; ++ bool was_idle; ++ u64 now; ++ ++ if (!group_can_run(group)) ++ return; ++ ++ /* All updated queues are blocked, no need to wake up the scheduler. */ ++ if ((queue_mask & group->blocked_queues) == queue_mask) ++ return; ++ ++ was_idle = group_is_idle(group); ++ group->idle_queues &= ~queue_mask; ++ ++ /* Don't mess up with the lists if we're in a middle of a reset. */ ++ if (atomic_read(&sched->reset.in_progress)) ++ return; ++ ++ if (was_idle && !group_is_idle(group)) ++ list_move_tail(&group->run_node, queue); ++ ++ /* RT groups are preemptive. */ ++ if (group->priority == PANTHOR_CSG_PRIORITY_RT) { ++ sched_queue_delayed_work(sched, tick, 0); ++ return; ++ } ++ ++ /* Some groups might be idle, force an immediate tick to ++ * re-evaluate. ++ */ ++ if (sched->might_have_idle_groups) { ++ sched_queue_delayed_work(sched, tick, 0); ++ return; ++ } ++ ++ /* Scheduler is ticking, nothing to do. */ ++ if (sched->resched_target != U64_MAX) { ++ /* If there are free slots, force immediating ticking. */ ++ if (sched->used_csg_slot_count < sched->csg_slot_count) ++ sched_queue_delayed_work(sched, tick, 0); ++ ++ return; ++ } ++ ++ /* Scheduler tick was off, recalculate the resched_target based on the ++ * last tick event, and queue the scheduler work. ++ */ ++ now = get_jiffies_64(); ++ sched->resched_target = sched->last_tick + sched->tick_period; ++ if (sched->used_csg_slot_count == sched->csg_slot_count && ++ time_before64(now, sched->resched_target)) ++ delay_jiffies = min_t(unsigned long, sched->resched_target - now, ULONG_MAX); ++ ++ sched_queue_delayed_work(sched, tick, delay_jiffies); ++} ++ ++static void queue_stop(struct panthor_queue *queue, ++ struct panthor_job *bad_job) ++{ ++ drm_sched_stop(&queue->scheduler, bad_job ? &bad_job->base : NULL); ++} ++ ++static void queue_start(struct panthor_queue *queue) ++{ ++ struct panthor_job *job; ++ ++ /* Re-assign the parent fences. */ ++ list_for_each_entry(job, &queue->scheduler.pending_list, base.list) ++ job->base.s_fence->parent = dma_fence_get(job->done_fence); ++ ++ drm_sched_start(&queue->scheduler, true); ++} ++ ++static void panthor_group_stop(struct panthor_group *group) ++{ ++ struct panthor_scheduler *sched = group->ptdev->scheduler; ++ ++ lockdep_assert_held(&sched->reset.lock); ++ ++ for (u32 i = 0; i < group->queue_count; i++) ++ queue_stop(group->queues[i], NULL); ++ ++ group_get(group); ++ list_move_tail(&group->run_node, &sched->reset.stopped_groups); ++} ++ ++static void panthor_group_start(struct panthor_group *group) ++{ ++ struct panthor_scheduler *sched = group->ptdev->scheduler; ++ ++ lockdep_assert_held(&group->ptdev->scheduler->reset.lock); ++ ++ for (u32 i = 0; i < group->queue_count; i++) ++ queue_start(group->queues[i]); ++ ++ if (group_can_run(group)) { ++ list_move_tail(&group->run_node, ++ group_is_idle(group) ? ++ &sched->groups.idle[group->priority] : ++ &sched->groups.runnable[group->priority]); ++ } else { ++ list_del_init(&group->run_node); ++ list_del_init(&group->wait_node); ++ group_queue_work(group, term); ++ } ++ ++ group_put(group); ++} ++ ++static void panthor_sched_immediate_tick(struct panthor_device *ptdev) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ ++ sched_queue_delayed_work(sched, tick, 0); ++} ++ ++/** ++ * panthor_sched_report_mmu_fault() - Report MMU faults to the scheduler. ++ */ ++void panthor_sched_report_mmu_fault(struct panthor_device *ptdev) ++{ ++ /* Force a tick to immediately kill faulty groups. */ ++ if (ptdev->scheduler) ++ panthor_sched_immediate_tick(ptdev); ++} ++ ++void panthor_sched_resume(struct panthor_device *ptdev) ++{ ++ /* Force a tick to re-evaluate after a resume. */ ++ panthor_sched_immediate_tick(ptdev); ++} ++ ++void panthor_sched_suspend(struct panthor_device *ptdev) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_csg_slots_upd_ctx upd_ctx; ++ u64 suspended_slots, faulty_slots; ++ struct panthor_group *group; ++ u32 i; ++ ++ mutex_lock(&sched->lock); ++ csgs_upd_ctx_init(&upd_ctx); ++ for (i = 0; i < sched->csg_slot_count; i++) { ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[i]; ++ ++ if (csg_slot->group) { ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, i, ++ CSG_STATE_SUSPEND, ++ CSG_STATE_MASK); ++ } ++ } ++ ++ suspended_slots = upd_ctx.update_mask; ++ ++ csgs_upd_ctx_apply_locked(ptdev, &upd_ctx); ++ suspended_slots &= ~upd_ctx.timedout_mask; ++ faulty_slots = upd_ctx.timedout_mask; ++ ++ if (faulty_slots) { ++ u32 slot_mask = faulty_slots; ++ ++ drm_err(&ptdev->base, "CSG suspend failed, escalating to termination"); ++ csgs_upd_ctx_init(&upd_ctx); ++ while (slot_mask) { ++ u32 csg_id = ffs(slot_mask) - 1; ++ ++ csgs_upd_ctx_queue_reqs(ptdev, &upd_ctx, csg_id, ++ CSG_STATE_TERMINATE, ++ CSG_STATE_MASK); ++ slot_mask &= ~BIT(csg_id); ++ } ++ ++ csgs_upd_ctx_apply_locked(ptdev, &upd_ctx); ++ ++ slot_mask = upd_ctx.timedout_mask; ++ while (slot_mask) { ++ u32 csg_id = ffs(slot_mask) - 1; ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id]; ++ ++ /* Terminate command timedout, but the soft-reset will ++ * automatically terminate all active groups, so let's ++ * force the state to halted here. ++ */ ++ if (csg_slot->group->state != PANTHOR_CS_GROUP_TERMINATED) ++ csg_slot->group->state = PANTHOR_CS_GROUP_TERMINATED; ++ slot_mask &= ~BIT(csg_id); ++ } ++ } ++ ++ /* Flush L2 and LSC caches to make sure suspend state is up-to-date. ++ * If the flush fails, flag all queues for termination. ++ */ ++ if (suspended_slots) { ++ bool flush_caches_failed = false; ++ u32 slot_mask = suspended_slots; ++ ++ if (panthor_gpu_flush_caches(ptdev, CACHE_CLEAN, CACHE_CLEAN, 0)) ++ flush_caches_failed = true; ++ ++ while (slot_mask) { ++ u32 csg_id = ffs(slot_mask) - 1; ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[csg_id]; ++ ++ if (flush_caches_failed) ++ csg_slot->group->state = PANTHOR_CS_GROUP_TERMINATED; ++ else ++ csg_slot_sync_update_locked(ptdev, csg_id); ++ ++ slot_mask &= ~BIT(csg_id); ++ } ++ ++ if (flush_caches_failed) ++ faulty_slots |= suspended_slots; ++ } ++ ++ for (i = 0; i < sched->csg_slot_count; i++) { ++ struct panthor_csg_slot *csg_slot = &sched->csg_slots[i]; ++ ++ group = csg_slot->group; ++ if (!group) ++ continue; ++ ++ group_get(group); ++ ++ if (group->csg_id >= 0) ++ sched_process_csg_irq_locked(ptdev, group->csg_id); ++ ++ group_unbind_locked(group); ++ ++ drm_WARN_ON(&group->ptdev->base, !list_empty(&group->run_node)); ++ ++ if (group_can_run(group)) { ++ list_add(&group->run_node, ++ &sched->groups.idle[group->priority]); ++ } else { ++ /* We don't bother stopping the scheduler if the group is ++ * faulty, the group termination work will finish the job. ++ */ ++ list_del_init(&group->wait_node); ++ group_queue_work(group, term); ++ } ++ group_put(group); ++ } ++ mutex_unlock(&sched->lock); ++} ++ ++void panthor_sched_pre_reset(struct panthor_device *ptdev) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_group *group, *group_tmp; ++ u32 i; ++ ++ mutex_lock(&sched->reset.lock); ++ atomic_set(&sched->reset.in_progress, true); ++ ++ /* Cancel all scheduler works. Once this is done, these works can't be ++ * scheduled again until the reset operation is complete. ++ */ ++ cancel_work_sync(&sched->sync_upd_work); ++ cancel_delayed_work_sync(&sched->tick_work); ++ ++ panthor_sched_suspend(ptdev); ++ ++ /* Stop all groups that might still accept jobs, so we don't get passed ++ * new jobs while we're resetting. ++ */ ++ for (i = 0; i < ARRAY_SIZE(sched->groups.runnable); i++) { ++ /* All groups should be in the idle lists. */ ++ drm_WARN_ON(&ptdev->base, !list_empty(&sched->groups.runnable[i])); ++ list_for_each_entry_safe(group, group_tmp, &sched->groups.runnable[i], run_node) ++ panthor_group_stop(group); ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(sched->groups.idle); i++) { ++ list_for_each_entry_safe(group, group_tmp, &sched->groups.idle[i], run_node) ++ panthor_group_stop(group); ++ } ++ ++ mutex_unlock(&sched->reset.lock); ++} ++ ++void panthor_sched_post_reset(struct panthor_device *ptdev) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_group *group, *group_tmp; ++ ++ mutex_lock(&sched->reset.lock); ++ ++ list_for_each_entry_safe(group, group_tmp, &sched->reset.stopped_groups, run_node) ++ panthor_group_start(group); ++ ++ /* We're done resetting the GPU, clear the reset.in_progress bit so we can ++ * kick the scheduler. ++ */ ++ atomic_set(&sched->reset.in_progress, false); ++ mutex_unlock(&sched->reset.lock); ++ ++ sched_queue_delayed_work(sched, tick, 0); ++ ++ sched_queue_work(sched, sync_upd); ++} ++ ++static void group_sync_upd_work(struct work_struct *work) ++{ ++ struct panthor_group *group = ++ container_of(work, struct panthor_group, sync_upd_work); ++ struct panthor_job *job, *job_tmp; ++ LIST_HEAD(done_jobs); ++ u32 queue_idx; ++ bool cookie; ++ ++ cookie = dma_fence_begin_signalling(); ++ for (queue_idx = 0; queue_idx < group->queue_count; queue_idx++) { ++ struct panthor_queue *queue = group->queues[queue_idx]; ++ struct panthor_syncobj_64b *syncobj; ++ ++ if (!queue) ++ continue; ++ ++ syncobj = group->syncobjs->kmap + (queue_idx * sizeof(*syncobj)); ++ ++ spin_lock(&queue->fence_ctx.lock); ++ list_for_each_entry_safe(job, job_tmp, &queue->fence_ctx.in_flight_jobs, node) { ++ if (!job->call_info.size) ++ continue; ++ ++ if (syncobj->seqno < job->done_fence->seqno) ++ break; ++ ++ list_move_tail(&job->node, &done_jobs); ++ dma_fence_signal_locked(job->done_fence); ++ } ++ spin_unlock(&queue->fence_ctx.lock); ++ } ++ dma_fence_end_signalling(cookie); ++ ++ list_for_each_entry_safe(job, job_tmp, &done_jobs, node) { ++ list_del_init(&job->node); ++ panthor_job_put(&job->base); ++ } ++ ++ group_put(group); ++} ++ ++static struct dma_fence * ++queue_run_job(struct drm_sched_job *sched_job) ++{ ++ struct panthor_job *job = container_of(sched_job, struct panthor_job, base); ++ struct panthor_group *group = job->group; ++ struct panthor_queue *queue = group->queues[job->queue_idx]; ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ u32 ringbuf_size = panthor_kernel_bo_size(queue->ringbuf); ++ u32 ringbuf_insert = queue->iface.input->insert & (ringbuf_size - 1); ++ u64 addr_reg = ptdev->csif_info.cs_reg_count - ++ ptdev->csif_info.unpreserved_cs_reg_count; ++ u64 val_reg = addr_reg + 2; ++ u64 sync_addr = panthor_kernel_bo_gpuva(group->syncobjs) + ++ job->queue_idx * sizeof(struct panthor_syncobj_64b); ++ u32 waitall_mask = GENMASK(sched->sb_slot_count - 1, 0); ++ struct dma_fence *done_fence; ++ int ret; ++ ++ u64 call_instrs[NUM_INSTRS_PER_SLOT] = { ++ /* MOV32 rX+2, cs.latest_flush */ ++ (2ull << 56) | (val_reg << 48) | job->call_info.latest_flush, ++ ++ /* FLUSH_CACHE2.clean_inv_all.no_wait.signal(0) rX+2 */ ++ (36ull << 56) | (0ull << 48) | (val_reg << 40) | (0 << 16) | 0x233, ++ ++ /* MOV48 rX:rX+1, cs.start */ ++ (1ull << 56) | (addr_reg << 48) | job->call_info.start, ++ ++ /* MOV32 rX+2, cs.size */ ++ (2ull << 56) | (val_reg << 48) | job->call_info.size, ++ ++ /* WAIT(0) => waits for FLUSH_CACHE2 instruction */ ++ (3ull << 56) | (1 << 16), ++ ++ /* CALL rX:rX+1, rX+2 */ ++ (32ull << 56) | (addr_reg << 40) | (val_reg << 32), ++ ++ /* MOV48 rX:rX+1, sync_addr */ ++ (1ull << 56) | (addr_reg << 48) | sync_addr, ++ ++ /* MOV48 rX+2, #1 */ ++ (1ull << 56) | (val_reg << 48) | 1, ++ ++ /* WAIT(all) */ ++ (3ull << 56) | (waitall_mask << 16), ++ ++ /* SYNC_ADD64.system_scope.propage_err.nowait rX:rX+1, rX+2*/ ++ (51ull << 56) | (0ull << 48) | (addr_reg << 40) | (val_reg << 32) | (0 << 16) | 1, ++ ++ /* ERROR_BARRIER, so we can recover from faults at job ++ * boundaries. ++ */ ++ (47ull << 56), ++ }; ++ ++ /* Need to be cacheline aligned to please the prefetcher. */ ++ static_assert(sizeof(call_instrs) % 64 == 0, ++ "call_instrs is not aligned on a cacheline"); ++ ++ /* Stream size is zero, nothing to do => return a NULL fence and let ++ * drm_sched signal the parent. ++ */ ++ if (!job->call_info.size) ++ return NULL; ++ ++ ret = pm_runtime_resume_and_get(ptdev->base.dev); ++ if (drm_WARN_ON(&ptdev->base, ret)) ++ return ERR_PTR(ret); ++ ++ mutex_lock(&sched->lock); ++ if (!group_can_run(group)) { ++ done_fence = ERR_PTR(-ECANCELED); ++ goto out_unlock; ++ } ++ ++ dma_fence_init(job->done_fence, ++ &panthor_queue_fence_ops, ++ &queue->fence_ctx.lock, ++ queue->fence_ctx.id, ++ atomic64_inc_return(&queue->fence_ctx.seqno)); ++ ++ memcpy(queue->ringbuf->kmap + ringbuf_insert, ++ call_instrs, sizeof(call_instrs)); ++ ++ panthor_job_get(&job->base); ++ spin_lock(&queue->fence_ctx.lock); ++ list_add_tail(&job->node, &queue->fence_ctx.in_flight_jobs); ++ spin_unlock(&queue->fence_ctx.lock); ++ ++ job->ringbuf.start = queue->iface.input->insert; ++ job->ringbuf.end = job->ringbuf.start + sizeof(call_instrs); ++ ++ /* Make sure the ring buffer is updated before the INSERT ++ * register. ++ */ ++ wmb(); ++ ++ queue->iface.input->extract = queue->iface.output->extract; ++ queue->iface.input->insert = job->ringbuf.end; ++ ++ if (group->csg_id < 0) { ++ /* If the queue is blocked, we want to keep the timeout running, so we ++ * can detect unbounded waits and kill the group when that happens. ++ * Otherwise, we suspend the timeout so the time we spend waiting for ++ * a CSG slot is not counted. ++ */ ++ if (!(group->blocked_queues & BIT(job->queue_idx)) && ++ !queue->timeout_suspended) { ++ queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler); ++ queue->timeout_suspended = true; ++ } ++ ++ group_schedule_locked(group, BIT(job->queue_idx)); ++ } else { ++ gpu_write(ptdev, CSF_DOORBELL(queue->doorbell_id), 1); ++ if (!sched->pm.has_ref && ++ !(group->blocked_queues & BIT(job->queue_idx))) { ++ pm_runtime_get(ptdev->base.dev); ++ sched->pm.has_ref = true; ++ } ++ } ++ ++ done_fence = dma_fence_get(job->done_fence); ++ ++out_unlock: ++ mutex_unlock(&sched->lock); ++ pm_runtime_mark_last_busy(ptdev->base.dev); ++ pm_runtime_put_autosuspend(ptdev->base.dev); ++ ++ return done_fence; ++} ++ ++static enum drm_gpu_sched_stat ++queue_timedout_job(struct drm_sched_job *sched_job) ++{ ++ struct panthor_job *job = container_of(sched_job, struct panthor_job, base); ++ struct panthor_group *group = job->group; ++ struct panthor_device *ptdev = group->ptdev; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_queue *queue = group->queues[job->queue_idx]; ++ ++ drm_warn(&ptdev->base, "job timeout\n"); ++ ++ drm_WARN_ON(&ptdev->base, atomic_read(&sched->reset.in_progress)); ++ ++ queue_stop(queue, job); ++ ++ mutex_lock(&sched->lock); ++ group->timedout = true; ++ if (group->csg_id >= 0) { ++ sched_queue_delayed_work(ptdev->scheduler, tick, 0); ++ } else { ++ /* Remove from the run queues, so the scheduler can't ++ * pick the group on the next tick. ++ */ ++ list_del_init(&group->run_node); ++ list_del_init(&group->wait_node); ++ ++ group_queue_work(group, term); ++ } ++ mutex_unlock(&sched->lock); ++ ++ queue_start(queue); ++ ++ return DRM_GPU_SCHED_STAT_NOMINAL; ++} ++ ++static void queue_free_job(struct drm_sched_job *sched_job) ++{ ++ drm_sched_job_cleanup(sched_job); ++ panthor_job_put(sched_job); ++} ++ ++static const struct drm_sched_backend_ops panthor_queue_sched_ops = { ++ .run_job = queue_run_job, ++ .timedout_job = queue_timedout_job, ++ .free_job = queue_free_job, ++}; ++ ++static struct panthor_queue * ++group_create_queue(struct panthor_group *group, ++ const struct drm_panthor_queue_create *args) ++{ ++ struct drm_gpu_scheduler *drm_sched; ++ struct panthor_queue *queue; ++ int ret; ++ ++ if (args->pad[0] || args->pad[1] || args->pad[2]) ++ return ERR_PTR(-EINVAL); ++ ++ if (args->ringbuf_size < SZ_4K || args->ringbuf_size > SZ_64K || ++ !is_power_of_2(args->ringbuf_size)) ++ return ERR_PTR(-EINVAL); ++ ++ if (args->priority > CSF_MAX_QUEUE_PRIO) ++ return ERR_PTR(-EINVAL); ++ ++ queue = kzalloc(sizeof(*queue), GFP_KERNEL); ++ if (!queue) ++ return ERR_PTR(-ENOMEM); ++ ++ queue->fence_ctx.id = dma_fence_context_alloc(1); ++ spin_lock_init(&queue->fence_ctx.lock); ++ INIT_LIST_HEAD(&queue->fence_ctx.in_flight_jobs); ++ ++ queue->priority = args->priority; ++ ++ queue->ringbuf = panthor_kernel_bo_create(group->ptdev, group->vm, ++ args->ringbuf_size, ++ DRM_PANTHOR_BO_NO_MMAP, ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | ++ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, ++ PANTHOR_VM_KERNEL_AUTO_VA); ++ if (IS_ERR(queue->ringbuf)) { ++ ret = PTR_ERR(queue->ringbuf); ++ goto err_free_queue; ++ } ++ ++ ret = panthor_kernel_bo_vmap(queue->ringbuf); ++ if (ret) ++ goto err_free_queue; ++ ++ queue->iface.mem = panthor_fw_alloc_queue_iface_mem(group->ptdev, ++ &queue->iface.input, ++ &queue->iface.output, ++ &queue->iface.input_fw_va, ++ &queue->iface.output_fw_va); ++ if (IS_ERR(queue->iface.mem)) { ++ ret = PTR_ERR(queue->iface.mem); ++ goto err_free_queue; ++ } ++ ++ ret = drm_sched_init(&queue->scheduler, &panthor_queue_sched_ops, ++ group->ptdev->scheduler->wq, 1, ++ args->ringbuf_size / (NUM_INSTRS_PER_SLOT * sizeof(u64)), ++ 0, msecs_to_jiffies(JOB_TIMEOUT_MS), ++ group->ptdev->reset.wq, ++ NULL, "panthor-queue", group->ptdev->base.dev); ++ if (ret) ++ goto err_free_queue; ++ ++ drm_sched = &queue->scheduler; ++ ret = drm_sched_entity_init(&queue->entity, 0, &drm_sched, 1, NULL); ++ ++ return queue; ++ ++err_free_queue: ++ group_free_queue(group, queue); ++ return ERR_PTR(ret); ++} ++ ++#define MAX_GROUPS_PER_POOL 128 ++ ++int panthor_group_create(struct panthor_file *pfile, ++ const struct drm_panthor_group_create *group_args, ++ const struct drm_panthor_queue_create *queue_args) ++{ ++ struct panthor_device *ptdev = pfile->ptdev; ++ struct panthor_group_pool *gpool = pfile->groups; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0); ++ struct panthor_group *group = NULL; ++ u32 gid, i, suspend_size; ++ int ret; ++ ++ if (group_args->pad) ++ return -EINVAL; ++ ++ if (group_args->priority > PANTHOR_CSG_PRIORITY_HIGH) ++ return -EINVAL; ++ ++ if ((group_args->compute_core_mask & ~ptdev->gpu_info.shader_present) || ++ (group_args->fragment_core_mask & ~ptdev->gpu_info.shader_present) || ++ (group_args->tiler_core_mask & ~ptdev->gpu_info.tiler_present)) ++ return -EINVAL; ++ ++ if (hweight64(group_args->compute_core_mask) < group_args->max_compute_cores || ++ hweight64(group_args->fragment_core_mask) < group_args->max_fragment_cores || ++ hweight64(group_args->tiler_core_mask) < group_args->max_tiler_cores) ++ return -EINVAL; ++ ++ group = kzalloc(sizeof(*group), GFP_KERNEL); ++ if (!group) ++ return -ENOMEM; ++ ++ spin_lock_init(&group->fatal_lock); ++ kref_init(&group->refcount); ++ group->state = PANTHOR_CS_GROUP_CREATED; ++ group->csg_id = -1; ++ ++ group->ptdev = ptdev; ++ group->max_compute_cores = group_args->max_compute_cores; ++ group->compute_core_mask = group_args->compute_core_mask; ++ group->max_fragment_cores = group_args->max_fragment_cores; ++ group->fragment_core_mask = group_args->fragment_core_mask; ++ group->max_tiler_cores = group_args->max_tiler_cores; ++ group->tiler_core_mask = group_args->tiler_core_mask; ++ group->priority = group_args->priority; ++ ++ INIT_LIST_HEAD(&group->wait_node); ++ INIT_LIST_HEAD(&group->run_node); ++ INIT_WORK(&group->term_work, group_term_work); ++ INIT_WORK(&group->sync_upd_work, group_sync_upd_work); ++ INIT_WORK(&group->tiler_oom_work, group_tiler_oom_work); ++ INIT_WORK(&group->release_work, group_release_work); ++ ++ group->vm = panthor_vm_pool_get_vm(pfile->vms, group_args->vm_id); ++ if (!group->vm) { ++ ret = -EINVAL; ++ goto err_put_group; ++ } ++ ++ suspend_size = csg_iface->control->suspend_size; ++ group->suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size); ++ if (IS_ERR(group->suspend_buf)) { ++ ret = PTR_ERR(group->suspend_buf); ++ group->suspend_buf = NULL; ++ goto err_put_group; ++ } ++ ++ suspend_size = csg_iface->control->protm_suspend_size; ++ group->protm_suspend_buf = panthor_fw_alloc_suspend_buf_mem(ptdev, suspend_size); ++ if (IS_ERR(group->protm_suspend_buf)) { ++ ret = PTR_ERR(group->protm_suspend_buf); ++ group->protm_suspend_buf = NULL; ++ goto err_put_group; ++ } ++ ++ group->syncobjs = panthor_kernel_bo_create(ptdev, group->vm, ++ group_args->queues.count * ++ sizeof(struct panthor_syncobj_64b), ++ DRM_PANTHOR_BO_NO_MMAP, ++ DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC | ++ DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED, ++ PANTHOR_VM_KERNEL_AUTO_VA); ++ if (IS_ERR(group->syncobjs)) { ++ ret = PTR_ERR(group->syncobjs); ++ goto err_put_group; ++ } ++ ++ ret = panthor_kernel_bo_vmap(group->syncobjs); ++ if (ret) ++ goto err_put_group; ++ ++ memset(group->syncobjs->kmap, 0, ++ group_args->queues.count * sizeof(struct panthor_syncobj_64b)); ++ ++ for (i = 0; i < group_args->queues.count; i++) { ++ group->queues[i] = group_create_queue(group, &queue_args[i]); ++ if (IS_ERR(group->queues[i])) { ++ ret = PTR_ERR(group->queues[i]); ++ group->queues[i] = NULL; ++ goto err_put_group; ++ } ++ ++ group->queue_count++; ++ } ++ ++ group->idle_queues = GENMASK(group->queue_count - 1, 0); ++ ++ ret = xa_alloc(&gpool->xa, &gid, group, XA_LIMIT(1, MAX_GROUPS_PER_POOL), GFP_KERNEL); ++ if (ret) ++ goto err_put_group; ++ ++ mutex_lock(&sched->reset.lock); ++ if (atomic_read(&sched->reset.in_progress)) { ++ panthor_group_stop(group); ++ } else { ++ mutex_lock(&sched->lock); ++ list_add_tail(&group->run_node, ++ &sched->groups.idle[group->priority]); ++ mutex_unlock(&sched->lock); ++ } ++ mutex_unlock(&sched->reset.lock); ++ ++ return gid; ++ ++err_put_group: ++ group_put(group); ++ return ret; ++} ++ ++int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle) ++{ ++ struct panthor_group_pool *gpool = pfile->groups; ++ struct panthor_device *ptdev = pfile->ptdev; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_group *group; ++ ++ group = xa_erase(&gpool->xa, group_handle); ++ if (!group) ++ return -EINVAL; ++ ++ for (u32 i = 0; i < group->queue_count; i++) { ++ if (group->queues[i]) ++ drm_sched_entity_destroy(&group->queues[i]->entity); ++ } ++ ++ mutex_lock(&sched->reset.lock); ++ mutex_lock(&sched->lock); ++ group->destroyed = true; ++ if (group->csg_id >= 0) { ++ sched_queue_delayed_work(sched, tick, 0); ++ } else if (!atomic_read(&sched->reset.in_progress)) { ++ /* Remove from the run queues, so the scheduler can't ++ * pick the group on the next tick. ++ */ ++ list_del_init(&group->run_node); ++ list_del_init(&group->wait_node); ++ group_queue_work(group, term); ++ } ++ mutex_unlock(&sched->lock); ++ mutex_unlock(&sched->reset.lock); ++ ++ group_put(group); ++ return 0; ++} ++ ++int panthor_group_get_state(struct panthor_file *pfile, ++ struct drm_panthor_group_get_state *get_state) ++{ ++ struct panthor_group_pool *gpool = pfile->groups; ++ struct panthor_device *ptdev = pfile->ptdev; ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ struct panthor_group *group; ++ ++ if (get_state->pad) ++ return -EINVAL; ++ ++ group = group_get(xa_load(&gpool->xa, get_state->group_handle)); ++ if (!group) ++ return -EINVAL; ++ ++ memset(get_state, 0, sizeof(*get_state)); ++ ++ mutex_lock(&sched->lock); ++ if (group->timedout) ++ get_state->state |= DRM_PANTHOR_GROUP_STATE_TIMEDOUT; ++ if (group->fatal_queues) { ++ get_state->state |= DRM_PANTHOR_GROUP_STATE_FATAL_FAULT; ++ get_state->fatal_queues = group->fatal_queues; ++ } ++ mutex_unlock(&sched->lock); ++ ++ group_put(group); ++ return 0; ++} ++ ++int panthor_group_pool_create(struct panthor_file *pfile) ++{ ++ struct panthor_group_pool *gpool; ++ ++ gpool = kzalloc(sizeof(*gpool), GFP_KERNEL); ++ if (!gpool) ++ return -ENOMEM; ++ ++ xa_init_flags(&gpool->xa, XA_FLAGS_ALLOC1); ++ pfile->groups = gpool; ++ return 0; ++} ++ ++void panthor_group_pool_destroy(struct panthor_file *pfile) ++{ ++ struct panthor_group_pool *gpool = pfile->groups; ++ struct panthor_group *group; ++ unsigned long i; ++ ++ if (IS_ERR_OR_NULL(gpool)) ++ return; ++ ++ xa_for_each(&gpool->xa, i, group) ++ panthor_group_destroy(pfile, i); ++ ++ xa_destroy(&gpool->xa); ++ kfree(gpool); ++ pfile->groups = NULL; ++} ++ ++static void job_release(struct kref *ref) ++{ ++ struct panthor_job *job = container_of(ref, struct panthor_job, refcount); ++ ++ drm_WARN_ON(&job->group->ptdev->base, !list_empty(&job->node)); ++ ++ if (job->base.s_fence) ++ drm_sched_job_cleanup(&job->base); ++ ++ if (job->done_fence && job->done_fence->ops) ++ dma_fence_put(job->done_fence); ++ else ++ dma_fence_free(job->done_fence); ++ ++ group_put(job->group); ++ ++ kfree(job); ++} ++ ++struct drm_sched_job *panthor_job_get(struct drm_sched_job *sched_job) ++{ ++ if (sched_job) { ++ struct panthor_job *job = container_of(sched_job, struct panthor_job, base); ++ ++ kref_get(&job->refcount); ++ } ++ ++ return sched_job; ++} ++ ++void panthor_job_put(struct drm_sched_job *sched_job) ++{ ++ struct panthor_job *job = container_of(sched_job, struct panthor_job, base); ++ ++ if (sched_job) ++ kref_put(&job->refcount, job_release); ++} ++ ++struct panthor_vm *panthor_job_vm(struct drm_sched_job *sched_job) ++{ ++ struct panthor_job *job = container_of(sched_job, struct panthor_job, base); ++ ++ return job->group->vm; ++} ++ ++struct drm_sched_job * ++panthor_job_create(struct panthor_file *pfile, ++ u16 group_handle, ++ const struct drm_panthor_queue_submit *qsubmit) ++{ ++ struct panthor_group_pool *gpool = pfile->groups; ++ struct panthor_job *job; ++ int ret; ++ ++ if (qsubmit->pad) ++ return ERR_PTR(-EINVAL); ++ ++ /* If stream_addr is zero, so stream_size should be. */ ++ if ((qsubmit->stream_size == 0) != (qsubmit->stream_addr == 0)) ++ return ERR_PTR(-EINVAL); ++ ++ /* Make sure the address is aligned on 64-byte (cacheline) and the size is ++ * aligned on 8-byte (instruction size). ++ */ ++ if ((qsubmit->stream_addr & 63) || (qsubmit->stream_size & 7)) ++ return ERR_PTR(-EINVAL); ++ ++ /* bits 24:30 must be zero. */ ++ if (qsubmit->latest_flush & GENMASK(30, 24)) ++ return ERR_PTR(-EINVAL); ++ ++ job = kzalloc(sizeof(*job), GFP_KERNEL); ++ if (!job) ++ return ERR_PTR(-ENOMEM); ++ ++ kref_init(&job->refcount); ++ job->queue_idx = qsubmit->queue_index; ++ job->call_info.size = qsubmit->stream_size; ++ job->call_info.start = qsubmit->stream_addr; ++ job->call_info.latest_flush = qsubmit->latest_flush; ++ INIT_LIST_HEAD(&job->node); ++ ++ job->group = group_get(xa_load(&gpool->xa, group_handle)); ++ if (!job->group) { ++ ret = -EINVAL; ++ goto err_put_job; ++ } ++ ++ if (job->queue_idx >= job->group->queue_count || ++ !job->group->queues[job->queue_idx]) { ++ ret = -EINVAL; ++ goto err_put_job; ++ } ++ ++ job->done_fence = kzalloc(sizeof(*job->done_fence), GFP_KERNEL); ++ if (!job->done_fence) { ++ ret = -ENOMEM; ++ goto err_put_job; ++ } ++ ++ ret = drm_sched_job_init(&job->base, ++ &job->group->queues[job->queue_idx]->entity, ++ 1, job->group); ++ if (ret) ++ goto err_put_job; ++ ++ return &job->base; ++ ++err_put_job: ++ panthor_job_put(&job->base); ++ return ERR_PTR(ret); ++} ++ ++void panthor_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *sched_job) ++{ ++ struct panthor_job *job = container_of(sched_job, struct panthor_job, base); ++ ++ /* Still not sure why we want USAGE_WRITE for external objects, since I ++ * was assuming this would be handled through explicit syncs being imported ++ * to external BOs with DMA_BUF_IOCTL_IMPORT_SYNC_FILE, but other drivers ++ * seem to pass DMA_RESV_USAGE_WRITE, so there must be a good reason. ++ */ ++ panthor_vm_update_resvs(job->group->vm, exec, &sched_job->s_fence->finished, ++ DMA_RESV_USAGE_BOOKKEEP, DMA_RESV_USAGE_WRITE); ++} ++ ++void panthor_sched_unplug(struct panthor_device *ptdev) ++{ ++ struct panthor_scheduler *sched = ptdev->scheduler; ++ ++ cancel_delayed_work_sync(&sched->tick_work); ++ ++ mutex_lock(&sched->lock); ++ if (sched->pm.has_ref) { ++ pm_runtime_put(ptdev->base.dev); ++ sched->pm.has_ref = false; ++ } ++ mutex_unlock(&sched->lock); ++} ++ ++static void panthor_sched_fini(struct drm_device *ddev, void *res) ++{ ++ struct panthor_scheduler *sched = res; ++ int prio; ++ ++ if (!sched || !sched->csg_slot_count) ++ return; ++ ++ cancel_delayed_work_sync(&sched->tick_work); ++ ++ if (sched->wq) ++ destroy_workqueue(sched->wq); ++ ++ if (sched->heap_alloc_wq) ++ destroy_workqueue(sched->heap_alloc_wq); ++ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) { ++ drm_WARN_ON(ddev, !list_empty(&sched->groups.runnable[prio])); ++ drm_WARN_ON(ddev, !list_empty(&sched->groups.idle[prio])); ++ } ++ ++ drm_WARN_ON(ddev, !list_empty(&sched->groups.waiting)); ++} ++ ++int panthor_sched_init(struct panthor_device *ptdev) ++{ ++ struct panthor_fw_global_iface *glb_iface = panthor_fw_get_glb_iface(ptdev); ++ struct panthor_fw_csg_iface *csg_iface = panthor_fw_get_csg_iface(ptdev, 0); ++ struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, 0, 0); ++ struct panthor_scheduler *sched; ++ u32 gpu_as_count, num_groups; ++ int prio, ret; ++ ++ sched = drmm_kzalloc(&ptdev->base, sizeof(*sched), GFP_KERNEL); ++ if (!sched) ++ return -ENOMEM; ++ ++ /* The highest bit in JOB_INT_* is reserved for globabl IRQs. That ++ * leaves 31 bits for CSG IRQs, hence the MAX_CSGS clamp here. ++ */ ++ num_groups = min_t(u32, MAX_CSGS, glb_iface->control->group_num); ++ ++ /* The FW-side scheduler might deadlock if two groups with the same ++ * priority try to access a set of resources that overlaps, with part ++ * of the resources being allocated to one group and the other part to ++ * the other group, both groups waiting for the remaining resources to ++ * be allocated. To avoid that, it is recommended to assign each CSG a ++ * different priority. In theory we could allow several groups to have ++ * the same CSG priority if they don't request the same resources, but ++ * that makes the scheduling logic more complicated, so let's clamp ++ * the number of CSG slots to MAX_CSG_PRIO + 1 for now. ++ */ ++ num_groups = min_t(u32, MAX_CSG_PRIO + 1, num_groups); ++ ++ /* We need at least one AS for the MCU and one for the GPU contexts. */ ++ gpu_as_count = hweight32(ptdev->gpu_info.as_present & GENMASK(31, 1)); ++ if (!gpu_as_count) { ++ drm_err(&ptdev->base, "Not enough AS (%d, expected at least 2)", ++ gpu_as_count + 1); ++ return -EINVAL; ++ } ++ ++ sched->ptdev = ptdev; ++ sched->sb_slot_count = CS_FEATURES_SCOREBOARDS(cs_iface->control->features); ++ sched->csg_slot_count = num_groups; ++ sched->cs_slot_count = csg_iface->control->stream_num; ++ sched->as_slot_count = gpu_as_count; ++ ptdev->csif_info.csg_slot_count = sched->csg_slot_count; ++ ptdev->csif_info.cs_slot_count = sched->cs_slot_count; ++ ptdev->csif_info.scoreboard_slot_count = sched->sb_slot_count; ++ ++ sched->last_tick = 0; ++ sched->resched_target = U64_MAX; ++ sched->tick_period = msecs_to_jiffies(10); ++ INIT_DELAYED_WORK(&sched->tick_work, tick_work); ++ INIT_WORK(&sched->sync_upd_work, sync_upd_work); ++ INIT_WORK(&sched->fw_events_work, process_fw_events_work); ++ ++ ret = drmm_mutex_init(&ptdev->base, &sched->lock); ++ if (ret) ++ return ret; ++ ++ for (prio = PANTHOR_CSG_PRIORITY_COUNT - 1; prio >= 0; prio--) { ++ INIT_LIST_HEAD(&sched->groups.runnable[prio]); ++ INIT_LIST_HEAD(&sched->groups.idle[prio]); ++ } ++ INIT_LIST_HEAD(&sched->groups.waiting); ++ ++ ret = drmm_mutex_init(&ptdev->base, &sched->reset.lock); ++ if (ret) ++ return ret; ++ ++ INIT_LIST_HEAD(&sched->reset.stopped_groups); ++ ++ /* sched->heap_alloc_wq will be used for heap chunk allocation on ++ * tiler OOM events, which means we can't use the same workqueue for ++ * the scheduler because works queued by the scheduler are in ++ * the dma-signalling path. Allocate a dedicated heap_alloc_wq to ++ * work around this limitation. ++ * ++ * FIXME: Ultimately, what we need is a failable/non-blocking GEM ++ * allocation path that we can call when a heap OOM is reported. The ++ * FW is smart enough to fall back on other methods if the kernel can't ++ * allocate memory, and fail the tiling job if none of these ++ * countermeasures worked. ++ * ++ * Set WQ_MEM_RECLAIM on sched->wq to unblock the situation when the ++ * system is running out of memory. ++ */ ++ sched->heap_alloc_wq = alloc_workqueue("panthor-heap-alloc", WQ_UNBOUND, 0); ++ sched->wq = alloc_workqueue("panthor-csf-sched", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); ++ if (!sched->wq || !sched->heap_alloc_wq) { ++ panthor_sched_fini(&ptdev->base, sched); ++ drm_err(&ptdev->base, "Failed to allocate the workqueues"); ++ return -ENOMEM; ++ } ++ ++ ret = drmm_add_action_or_reset(&ptdev->base, panthor_sched_fini, sched); ++ if (ret) ++ return ret; ++ ++ ptdev->scheduler = sched; ++ return 0; ++} +diff --git a/drivers/gpu/drm/panthor/panthor_sched.h b/drivers/gpu/drm/panthor/panthor_sched.h +new file mode 100644 +index 000000000000..66438b1f331f +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_sched.h +@@ -0,0 +1,50 @@ ++/* SPDX-License-Identifier: GPL-2.0 or MIT */ ++/* Copyright 2023 Collabora ltd. */ ++ ++#ifndef __PANTHOR_SCHED_H__ ++#define __PANTHOR_SCHED_H__ ++ ++struct drm_exec; ++struct dma_fence; ++struct drm_file; ++struct drm_gem_object; ++struct drm_sched_job; ++struct drm_panthor_group_create; ++struct drm_panthor_queue_create; ++struct drm_panthor_group_get_state; ++struct drm_panthor_queue_submit; ++struct panthor_device; ++struct panthor_file; ++struct panthor_group_pool; ++struct panthor_job; ++ ++int panthor_group_create(struct panthor_file *pfile, ++ const struct drm_panthor_group_create *group_args, ++ const struct drm_panthor_queue_create *queue_args); ++int panthor_group_destroy(struct panthor_file *pfile, u32 group_handle); ++int panthor_group_get_state(struct panthor_file *pfile, ++ struct drm_panthor_group_get_state *get_state); ++ ++struct drm_sched_job * ++panthor_job_create(struct panthor_file *pfile, ++ u16 group_handle, ++ const struct drm_panthor_queue_submit *qsubmit); ++struct drm_sched_job *panthor_job_get(struct drm_sched_job *job); ++struct panthor_vm *panthor_job_vm(struct drm_sched_job *sched_job); ++void panthor_job_put(struct drm_sched_job *job); ++void panthor_job_update_resvs(struct drm_exec *exec, struct drm_sched_job *job); ++ ++int panthor_group_pool_create(struct panthor_file *pfile); ++void panthor_group_pool_destroy(struct panthor_file *pfile); ++ ++int panthor_sched_init(struct panthor_device *ptdev); ++void panthor_sched_unplug(struct panthor_device *ptdev); ++void panthor_sched_pre_reset(struct panthor_device *ptdev); ++void panthor_sched_post_reset(struct panthor_device *ptdev); ++void panthor_sched_suspend(struct panthor_device *ptdev); ++void panthor_sched_resume(struct panthor_device *ptdev); ++ ++void panthor_sched_report_mmu_fault(struct panthor_device *ptdev); ++void panthor_sched_report_fw_events(struct panthor_device *ptdev, u32 events); ++ ++#endif +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:25 +0100 +Subject: drm/panthor: Add the driver frontend block + +This is the last piece missing to expose the driver to the outside +world. + +This is basically a wrapper between the ioctls and the other logical +blocks. + +v6: +- Add Maxime's and Heiko's acks +- Return a page-aligned BO size to userspace +- Keep header inclusion alphabetically ordered + +v5: +- Account for the drm_exec_init() prototype change +- Include platform_device.h + +v4: +- Add an ioctl to let the UMD query the VM state +- Fix kernel doc +- Let panthor_device_init() call panthor_device_init() +- Fix cleanup ordering in the panthor_init() error path +- Add Steve's and Liviu's R-b + +v3: +- Add acks for the MIT/GPL2 relicensing +- Fix 32-bit support +- Account for panthor_vm and panthor_sched changes +- Simplify the resv preparation/update logic +- Use a linked list rather than xarray for list of signals. +- Simplify panthor_get_uobj_array by returning the newly allocated + array. +- Drop the "DOC" for job submission helpers and move the relevant + comments to panthor_ioctl_group_submit(). +- Add helpers sync_op_is_signal()/sync_op_is_wait(). +- Simplify return type of panthor_submit_ctx_add_sync_signal() and + panthor_submit_ctx_get_sync_signal(). +- Drop WARN_ON from panthor_submit_ctx_add_job(). +- Fix typos in comments. + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Steven Price +Reviewed-by: Liviu Dudau +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/panthor/panthor_drv.c | 1473 ++++++++++ + 1 file changed, 1473 insertions(+) + +diff --git a/drivers/gpu/drm/panthor/panthor_drv.c b/drivers/gpu/drm/panthor/panthor_drv.c +new file mode 100644 +index 000000000000..ff484506229f +--- /dev/null ++++ b/drivers/gpu/drm/panthor/panthor_drv.c +@@ -0,0 +1,1473 @@ ++// SPDX-License-Identifier: GPL-2.0 or MIT ++/* Copyright 2018 Marty E. Plummer */ ++/* Copyright 2019 Linaro, Ltd., Rob Herring */ ++/* Copyright 2019 Collabora ltd. */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "panthor_device.h" ++#include "panthor_fw.h" ++#include "panthor_gem.h" ++#include "panthor_gpu.h" ++#include "panthor_heap.h" ++#include "panthor_mmu.h" ++#include "panthor_regs.h" ++#include "panthor_sched.h" ++ ++/** ++ * DOC: user <-> kernel object copy helpers. ++ */ ++ ++/** ++ * panthor_set_uobj() - Copy kernel object to user object. ++ * @usr_ptr: Users pointer. ++ * @usr_size: Size of the user object. ++ * @min_size: Minimum size for this object. ++ * @kern_size: Size of the kernel object. ++ * @in: Address of the kernel object to copy. ++ * ++ * Helper automating kernel -> user object copies. ++ * ++ * Don't use this function directly, use PANTHOR_UOBJ_SET() instead. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_set_uobj(u64 usr_ptr, u32 usr_size, u32 min_size, u32 kern_size, const void *in) ++{ ++ /* User size shouldn't be smaller than the minimal object size. */ ++ if (usr_size < min_size) ++ return -EINVAL; ++ ++ if (copy_to_user(u64_to_user_ptr(usr_ptr), in, min_t(u32, usr_size, kern_size))) ++ return -EFAULT; ++ ++ /* When the kernel object is smaller than the user object, we fill the gap with ++ * zeros. ++ */ ++ if (usr_size > kern_size && ++ clear_user(u64_to_user_ptr(usr_ptr + kern_size), usr_size - kern_size)) { ++ return -EFAULT; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_get_uobj_array() - Copy a user object array into a kernel accessible object array. ++ * @in: The object array to copy. ++ * @min_stride: Minimum array stride. ++ * @obj_size: Kernel object size. ++ * ++ * Helper automating user -> kernel object copies. ++ * ++ * Don't use this function directly, use PANTHOR_UOBJ_GET_ARRAY() instead. ++ * ++ * Return: newly allocated object array or an ERR_PTR on error. ++ */ ++static void * ++panthor_get_uobj_array(const struct drm_panthor_obj_array *in, u32 min_stride, ++ u32 obj_size) ++{ ++ int ret = 0; ++ void *out_alloc; ++ ++ /* User stride must be at least the minimum object size, otherwise it might ++ * lack useful information. ++ */ ++ if (in->stride < min_stride) ++ return ERR_PTR(-EINVAL); ++ ++ if (!in->count) ++ return NULL; ++ ++ out_alloc = kvmalloc_array(in->count, obj_size, GFP_KERNEL); ++ if (!out_alloc) ++ return ERR_PTR(-ENOMEM); ++ ++ if (obj_size == in->stride) { ++ /* Fast path when user/kernel have the same uAPI header version. */ ++ if (copy_from_user(out_alloc, u64_to_user_ptr(in->array), ++ (unsigned long)obj_size * in->count)) ++ ret = -EFAULT; ++ } else { ++ void __user *in_ptr = u64_to_user_ptr(in->array); ++ void *out_ptr = out_alloc; ++ ++ /* If the sizes differ, we need to copy elements one by one. */ ++ for (u32 i = 0; i < in->count; i++) { ++ ret = copy_struct_from_user(out_ptr, obj_size, in_ptr, in->stride); ++ if (ret) ++ break; ++ ++ out_ptr += obj_size; ++ in_ptr += in->stride; ++ } ++ } ++ ++ if (ret) { ++ kvfree(out_alloc); ++ return ERR_PTR(ret); ++ } ++ ++ return out_alloc; ++} ++ ++/** ++ * PANTHOR_UOBJ_MIN_SIZE_INTERNAL() - Get the minimum user object size ++ * @_typename: Object type. ++ * @_last_mandatory_field: Last mandatory field. ++ * ++ * Get the minimum user object size based on the last mandatory field name, ++ * A.K.A, the name of the last field of the structure at the time this ++ * structure was added to the uAPI. ++ * ++ * Don't use directly, use PANTHOR_UOBJ_DECL() instead. ++ */ ++#define PANTHOR_UOBJ_MIN_SIZE_INTERNAL(_typename, _last_mandatory_field) \ ++ (offsetof(_typename, _last_mandatory_field) + \ ++ sizeof(((_typename *)NULL)->_last_mandatory_field)) ++ ++/** ++ * PANTHOR_UOBJ_DECL() - Declare a new uAPI object whose subject to ++ * evolutions. ++ * @_typename: Object type. ++ * @_last_mandatory_field: Last mandatory field. ++ * ++ * Should be used to extend the PANTHOR_UOBJ_MIN_SIZE() list. ++ */ ++#define PANTHOR_UOBJ_DECL(_typename, _last_mandatory_field) \ ++ _typename : PANTHOR_UOBJ_MIN_SIZE_INTERNAL(_typename, _last_mandatory_field) ++ ++/** ++ * PANTHOR_UOBJ_MIN_SIZE() - Get the minimum size of a given uAPI object ++ * @_obj_name: Object to get the minimum size of. ++ * ++ * Don't use this macro directly, it's automatically called by ++ * PANTHOR_UOBJ_{SET,GET_ARRAY}(). ++ */ ++#define PANTHOR_UOBJ_MIN_SIZE(_obj_name) \ ++ _Generic(_obj_name, \ ++ PANTHOR_UOBJ_DECL(struct drm_panthor_gpu_info, tiler_present), \ ++ PANTHOR_UOBJ_DECL(struct drm_panthor_csif_info, pad), \ ++ PANTHOR_UOBJ_DECL(struct drm_panthor_sync_op, timeline_value), \ ++ PANTHOR_UOBJ_DECL(struct drm_panthor_queue_submit, syncs), \ ++ PANTHOR_UOBJ_DECL(struct drm_panthor_queue_create, ringbuf_size), \ ++ PANTHOR_UOBJ_DECL(struct drm_panthor_vm_bind_op, syncs)) ++ ++/** ++ * PANTHOR_UOBJ_SET() - Copy a kernel object to a user object. ++ * @_dest_usr_ptr: User pointer to copy to. ++ * @_usr_size: Size of the user object. ++ * @_src_obj: Kernel object to copy (not a pointer). ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++#define PANTHOR_UOBJ_SET(_dest_usr_ptr, _usr_size, _src_obj) \ ++ panthor_set_uobj(_dest_usr_ptr, _usr_size, \ ++ PANTHOR_UOBJ_MIN_SIZE(_src_obj), \ ++ sizeof(_src_obj), &(_src_obj)) ++ ++/** ++ * PANTHOR_UOBJ_GET_ARRAY() - Copy a user object array to a kernel accessible ++ * object array. ++ * @_dest_array: Local variable that will hold the newly allocated kernel ++ * object array. ++ * @_uobj_array: The drm_panthor_obj_array object describing the user object ++ * array. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++#define PANTHOR_UOBJ_GET_ARRAY(_dest_array, _uobj_array) \ ++ ({ \ ++ typeof(_dest_array) _tmp; \ ++ _tmp = panthor_get_uobj_array(_uobj_array, \ ++ PANTHOR_UOBJ_MIN_SIZE((_dest_array)[0]), \ ++ sizeof((_dest_array)[0])); \ ++ if (!IS_ERR(_tmp)) \ ++ _dest_array = _tmp; \ ++ PTR_ERR_OR_ZERO(_tmp); \ ++ }) ++ ++/** ++ * struct panthor_sync_signal - Represent a synchronization object point to attach ++ * our job fence to. ++ * ++ * This structure is here to keep track of fences that are currently bound to ++ * a specific syncobj point. ++ * ++ * At the beginning of a job submission, the fence ++ * is retrieved from the syncobj itself, and can be NULL if no fence was attached ++ * to this point. ++ * ++ * At the end, it points to the fence of the last job that had a ++ * %DRM_PANTHOR_SYNC_OP_SIGNAL on this syncobj. ++ * ++ * With jobs being submitted in batches, the fence might change several times during ++ * the process, allowing one job to wait on a job that's part of the same submission ++ * but appears earlier in the drm_panthor_group_submit::queue_submits array. ++ */ ++struct panthor_sync_signal { ++ /** @node: list_head to track signal ops within a submit operation */ ++ struct list_head node; ++ ++ /** @handle: The syncobj handle. */ ++ u32 handle; ++ ++ /** ++ * @point: The syncobj point. ++ * ++ * Zero for regular syncobjs, and non-zero for timeline syncobjs. ++ */ ++ u64 point; ++ ++ /** ++ * @syncobj: The sync object pointed by @handle. ++ */ ++ struct drm_syncobj *syncobj; ++ ++ /** ++ * @chain: Chain object used to link the new fence to an existing ++ * timeline syncobj. ++ * ++ * NULL for regular syncobj, non-NULL for timeline syncobjs. ++ */ ++ struct dma_fence_chain *chain; ++ ++ /** ++ * @fence: The fence to assign to the syncobj or syncobj-point. ++ */ ++ struct dma_fence *fence; ++}; ++ ++/** ++ * struct panthor_job_ctx - Job context ++ */ ++struct panthor_job_ctx { ++ /** @job: The job that is about to be submitted to drm_sched. */ ++ struct drm_sched_job *job; ++ ++ /** @syncops: Array of sync operations. */ ++ struct drm_panthor_sync_op *syncops; ++ ++ /** @syncop_count: Number of sync operations. */ ++ u32 syncop_count; ++}; ++ ++/** ++ * struct panthor_submit_ctx - Submission context ++ * ++ * Anything that's related to a submission (%DRM_IOCTL_PANTHOR_VM_BIND or ++ * %DRM_IOCTL_PANTHOR_GROUP_SUBMIT) is kept here, so we can automate the ++ * initialization and cleanup steps. ++ */ ++struct panthor_submit_ctx { ++ /** @file: DRM file this submission happens on. */ ++ struct drm_file *file; ++ ++ /** ++ * @signals: List of struct panthor_sync_signal. ++ * ++ * %DRM_PANTHOR_SYNC_OP_SIGNAL operations will be recorded here, ++ * and %DRM_PANTHOR_SYNC_OP_WAIT will first check if an entry ++ * matching the syncobj+point exists before calling ++ * drm_syncobj_find_fence(). This allows us to describe dependencies ++ * existing between jobs that are part of the same batch. ++ */ ++ struct list_head signals; ++ ++ /** @jobs: Array of jobs. */ ++ struct panthor_job_ctx *jobs; ++ ++ /** @job_count: Number of entries in the @jobs array. */ ++ u32 job_count; ++ ++ /** @exec: drm_exec context used to acquire and prepare resv objects. */ ++ struct drm_exec exec; ++}; ++ ++#define PANTHOR_SYNC_OP_FLAGS_MASK \ ++ (DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK | DRM_PANTHOR_SYNC_OP_SIGNAL) ++ ++static bool sync_op_is_signal(const struct drm_panthor_sync_op *sync_op) ++{ ++ return !!(sync_op->flags & DRM_PANTHOR_SYNC_OP_SIGNAL); ++} ++ ++static bool sync_op_is_wait(const struct drm_panthor_sync_op *sync_op) ++{ ++ /* Note that DRM_PANTHOR_SYNC_OP_WAIT == 0 */ ++ return !(sync_op->flags & DRM_PANTHOR_SYNC_OP_SIGNAL); ++} ++ ++/** ++ * panthor_check_sync_op() - Check drm_panthor_sync_op fields ++ * @sync_op: The sync operation to check. ++ * ++ * Return: 0 on success, -EINVAL otherwise. ++ */ ++static int ++panthor_check_sync_op(const struct drm_panthor_sync_op *sync_op) ++{ ++ u8 handle_type; ++ ++ if (sync_op->flags & ~PANTHOR_SYNC_OP_FLAGS_MASK) ++ return -EINVAL; ++ ++ handle_type = sync_op->flags & DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK; ++ if (handle_type != DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ && ++ handle_type != DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ) ++ return -EINVAL; ++ ++ if (handle_type == DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ && ++ sync_op->timeline_value != 0) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/** ++ * panthor_sync_signal_free() - Release resources and free a panthor_sync_signal object ++ * @sig_sync: Signal object to free. ++ */ ++static void ++panthor_sync_signal_free(struct panthor_sync_signal *sig_sync) ++{ ++ if (!sig_sync) ++ return; ++ ++ drm_syncobj_put(sig_sync->syncobj); ++ dma_fence_chain_free(sig_sync->chain); ++ dma_fence_put(sig_sync->fence); ++ kfree(sig_sync); ++} ++ ++/** ++ * panthor_submit_ctx_add_sync_signal() - Add a signal operation to a submit context ++ * @ctx: Context to add the signal operation to. ++ * @handle: Syncobj handle. ++ * @point: Syncobj point. ++ * ++ * Return: 0 on success, otherwise negative error value. ++ */ ++static int ++panthor_submit_ctx_add_sync_signal(struct panthor_submit_ctx *ctx, u32 handle, u64 point) ++{ ++ struct panthor_sync_signal *sig_sync; ++ struct dma_fence *cur_fence; ++ int ret; ++ ++ sig_sync = kzalloc(sizeof(*sig_sync), GFP_KERNEL); ++ if (!sig_sync) ++ return -ENOMEM; ++ ++ sig_sync->handle = handle; ++ sig_sync->point = point; ++ ++ if (point > 0) { ++ sig_sync->chain = dma_fence_chain_alloc(); ++ if (!sig_sync->chain) { ++ ret = -ENOMEM; ++ goto err_free_sig_sync; ++ } ++ } ++ ++ sig_sync->syncobj = drm_syncobj_find(ctx->file, handle); ++ if (!sig_sync->syncobj) { ++ ret = -EINVAL; ++ goto err_free_sig_sync; ++ } ++ ++ /* Retrieve the current fence attached to that point. It's ++ * perfectly fine to get a NULL fence here, it just means there's ++ * no fence attached to that point yet. ++ */ ++ if (!drm_syncobj_find_fence(ctx->file, handle, point, 0, &cur_fence)) ++ sig_sync->fence = cur_fence; ++ ++ list_add_tail(&sig_sync->node, &ctx->signals); ++ ++ return 0; ++ ++err_free_sig_sync: ++ panthor_sync_signal_free(sig_sync); ++ return ret; ++} ++ ++/** ++ * panthor_submit_ctx_search_sync_signal() - Search an existing signal operation in a ++ * submit context. ++ * @ctx: Context to search the signal operation in. ++ * @handle: Syncobj handle. ++ * @point: Syncobj point. ++ * ++ * Return: A valid panthor_sync_signal object if found, NULL otherwise. ++ */ ++static struct panthor_sync_signal * ++panthor_submit_ctx_search_sync_signal(struct panthor_submit_ctx *ctx, u32 handle, u64 point) ++{ ++ struct panthor_sync_signal *sig_sync; ++ ++ list_for_each_entry(sig_sync, &ctx->signals, node) { ++ if (handle == sig_sync->handle && point == sig_sync->point) ++ return sig_sync; ++ } ++ ++ return NULL; ++} ++ ++/** ++ * panthor_submit_ctx_add_job() - Add a job to a submit context ++ * @ctx: Context to search the signal operation in. ++ * @idx: Index of the job in the context. ++ * @job: Job to add. ++ * @syncs: Sync operations provided by userspace. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_add_job(struct panthor_submit_ctx *ctx, u32 idx, ++ struct drm_sched_job *job, ++ const struct drm_panthor_obj_array *syncs) ++{ ++ int ret; ++ ++ ctx->jobs[idx].job = job; ++ ++ ret = PANTHOR_UOBJ_GET_ARRAY(ctx->jobs[idx].syncops, syncs); ++ if (ret) ++ return ret; ++ ++ ctx->jobs[idx].syncop_count = syncs->count; ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_get_sync_signal() - Search signal operation and add one if none was found. ++ * @ctx: Context to search the signal operation in. ++ * @handle: Syncobj handle. ++ * @point: Syncobj point. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_get_sync_signal(struct panthor_submit_ctx *ctx, u32 handle, u64 point) ++{ ++ struct panthor_sync_signal *sig_sync; ++ ++ sig_sync = panthor_submit_ctx_search_sync_signal(ctx, handle, point); ++ if (sig_sync) ++ return 0; ++ ++ return panthor_submit_ctx_add_sync_signal(ctx, handle, point); ++} ++ ++/** ++ * panthor_submit_ctx_update_job_sync_signal_fences() - Update fences ++ * on the signal operations specified by a job. ++ * @ctx: Context to search the signal operation in. ++ * @job_idx: Index of the job to operate on. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_update_job_sync_signal_fences(struct panthor_submit_ctx *ctx, ++ u32 job_idx) ++{ ++ struct panthor_device *ptdev = container_of(ctx->file->minor->dev, ++ struct panthor_device, ++ base); ++ struct dma_fence *done_fence = &ctx->jobs[job_idx].job->s_fence->finished; ++ const struct drm_panthor_sync_op *sync_ops = ctx->jobs[job_idx].syncops; ++ u32 sync_op_count = ctx->jobs[job_idx].syncop_count; ++ ++ for (u32 i = 0; i < sync_op_count; i++) { ++ struct dma_fence *old_fence; ++ struct panthor_sync_signal *sig_sync; ++ ++ if (!sync_op_is_signal(&sync_ops[i])) ++ continue; ++ ++ sig_sync = panthor_submit_ctx_search_sync_signal(ctx, sync_ops[i].handle, ++ sync_ops[i].timeline_value); ++ if (drm_WARN_ON(&ptdev->base, !sig_sync)) ++ return -EINVAL; ++ ++ old_fence = sig_sync->fence; ++ sig_sync->fence = dma_fence_get(done_fence); ++ dma_fence_put(old_fence); ++ ++ if (drm_WARN_ON(&ptdev->base, !sig_sync->fence)) ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_collect_job_signal_ops() - Iterate over all job signal operations ++ * and add them to the context. ++ * @ctx: Context to search the signal operation in. ++ * @job_idx: Index of the job to operate on. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_collect_job_signal_ops(struct panthor_submit_ctx *ctx, ++ u32 job_idx) ++{ ++ const struct drm_panthor_sync_op *sync_ops = ctx->jobs[job_idx].syncops; ++ u32 sync_op_count = ctx->jobs[job_idx].syncop_count; ++ ++ for (u32 i = 0; i < sync_op_count; i++) { ++ int ret; ++ ++ if (!sync_op_is_signal(&sync_ops[i])) ++ continue; ++ ++ ret = panthor_check_sync_op(&sync_ops[i]); ++ if (ret) ++ return ret; ++ ++ ret = panthor_submit_ctx_get_sync_signal(ctx, ++ sync_ops[i].handle, ++ sync_ops[i].timeline_value); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_push_fences() - Iterate over the signal array, and for each entry, push ++ * the currently assigned fence to the associated syncobj. ++ * @ctx: Context to push fences on. ++ * ++ * This is the last step of a submission procedure, and is done once we know the submission ++ * is effective and job fences are guaranteed to be signaled in finite time. ++ */ ++static void ++panthor_submit_ctx_push_fences(struct panthor_submit_ctx *ctx) ++{ ++ struct panthor_sync_signal *sig_sync; ++ ++ list_for_each_entry(sig_sync, &ctx->signals, node) { ++ if (sig_sync->chain) { ++ drm_syncobj_add_point(sig_sync->syncobj, sig_sync->chain, ++ sig_sync->fence, sig_sync->point); ++ sig_sync->chain = NULL; ++ } else { ++ drm_syncobj_replace_fence(sig_sync->syncobj, sig_sync->fence); ++ } ++ } ++} ++ ++/** ++ * panthor_submit_ctx_add_sync_deps_to_job() - Add sync wait operations as ++ * job dependencies. ++ * @ctx: Submit context. ++ * @job_idx: Index of the job to operate on. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_add_sync_deps_to_job(struct panthor_submit_ctx *ctx, ++ u32 job_idx) ++{ ++ struct panthor_device *ptdev = container_of(ctx->file->minor->dev, ++ struct panthor_device, ++ base); ++ const struct drm_panthor_sync_op *sync_ops = ctx->jobs[job_idx].syncops; ++ struct drm_sched_job *job = ctx->jobs[job_idx].job; ++ u32 sync_op_count = ctx->jobs[job_idx].syncop_count; ++ int ret = 0; ++ ++ for (u32 i = 0; i < sync_op_count; i++) { ++ struct panthor_sync_signal *sig_sync; ++ struct dma_fence *fence; ++ ++ if (!sync_op_is_wait(&sync_ops[i])) ++ continue; ++ ++ ret = panthor_check_sync_op(&sync_ops[i]); ++ if (ret) ++ return ret; ++ ++ sig_sync = panthor_submit_ctx_search_sync_signal(ctx, sync_ops[i].handle, ++ sync_ops[i].timeline_value); ++ if (sig_sync) { ++ if (drm_WARN_ON(&ptdev->base, !sig_sync->fence)) ++ return -EINVAL; ++ ++ fence = dma_fence_get(sig_sync->fence); ++ } else { ++ ret = drm_syncobj_find_fence(ctx->file, sync_ops[i].handle, ++ sync_ops[i].timeline_value, ++ 0, &fence); ++ if (ret) ++ return ret; ++ } ++ ++ ret = drm_sched_job_add_dependency(job, fence); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_collect_jobs_signal_ops() - Collect all signal operations ++ * and add them to the submit context. ++ * @ctx: Submit context. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_collect_jobs_signal_ops(struct panthor_submit_ctx *ctx) ++{ ++ for (u32 i = 0; i < ctx->job_count; i++) { ++ int ret; ++ ++ ret = panthor_submit_ctx_collect_job_signal_ops(ctx, i); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_add_deps_and_arm_jobs() - Add jobs dependencies and arm jobs ++ * @ctx: Submit context. ++ * ++ * Must be called after the resv preparation has been taken care of. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int ++panthor_submit_ctx_add_deps_and_arm_jobs(struct panthor_submit_ctx *ctx) ++{ ++ for (u32 i = 0; i < ctx->job_count; i++) { ++ int ret; ++ ++ ret = panthor_submit_ctx_add_sync_deps_to_job(ctx, i); ++ if (ret) ++ return ret; ++ ++ drm_sched_job_arm(ctx->jobs[i].job); ++ ++ ret = panthor_submit_ctx_update_job_sync_signal_fences(ctx, i); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_push_jobs() - Push jobs to their scheduling entities. ++ * @ctx: Submit context. ++ * @upd_resvs: Callback used to update reservation objects that were previously ++ * preapred. ++ */ ++static void ++panthor_submit_ctx_push_jobs(struct panthor_submit_ctx *ctx, ++ void (*upd_resvs)(struct drm_exec *, struct drm_sched_job *)) ++{ ++ for (u32 i = 0; i < ctx->job_count; i++) { ++ upd_resvs(&ctx->exec, ctx->jobs[i].job); ++ drm_sched_entity_push_job(ctx->jobs[i].job); ++ ++ /* Job is owned by the scheduler now. */ ++ ctx->jobs[i].job = NULL; ++ } ++ ++ panthor_submit_ctx_push_fences(ctx); ++} ++ ++/** ++ * panthor_submit_ctx_init() - Initializes a submission context ++ * @ctx: Submit context to initialize. ++ * @file: drm_file this submission happens on. ++ * @job_count: Number of jobs that will be submitted. ++ * ++ * Return: 0 on success, a negative error code otherwise. ++ */ ++static int panthor_submit_ctx_init(struct panthor_submit_ctx *ctx, ++ struct drm_file *file, u32 job_count) ++{ ++ ctx->jobs = kvmalloc_array(job_count, sizeof(*ctx->jobs), ++ GFP_KERNEL | __GFP_ZERO); ++ if (!ctx->jobs) ++ return -ENOMEM; ++ ++ ctx->file = file; ++ ctx->job_count = job_count; ++ INIT_LIST_HEAD(&ctx->signals); ++ drm_exec_init(&ctx->exec, ++ DRM_EXEC_INTERRUPTIBLE_WAIT | DRM_EXEC_IGNORE_DUPLICATES, ++ 0); ++ return 0; ++} ++ ++/** ++ * panthor_submit_ctx_cleanup() - Cleanup a submission context ++ * @ctx: Submit context to cleanup. ++ * @job_put: Job put callback. ++ */ ++static void panthor_submit_ctx_cleanup(struct panthor_submit_ctx *ctx, ++ void (*job_put)(struct drm_sched_job *)) ++{ ++ struct panthor_sync_signal *sig_sync, *tmp; ++ unsigned long i; ++ ++ drm_exec_fini(&ctx->exec); ++ ++ list_for_each_entry_safe(sig_sync, tmp, &ctx->signals, node) ++ panthor_sync_signal_free(sig_sync); ++ ++ for (i = 0; i < ctx->job_count; i++) { ++ job_put(ctx->jobs[i].job); ++ kvfree(ctx->jobs[i].syncops); ++ } ++ ++ kvfree(ctx->jobs); ++} ++ ++static int panthor_ioctl_dev_query(struct drm_device *ddev, void *data, struct drm_file *file) ++{ ++ struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base); ++ struct drm_panthor_dev_query *args = data; ++ ++ if (!args->pointer) { ++ switch (args->type) { ++ case DRM_PANTHOR_DEV_QUERY_GPU_INFO: ++ args->size = sizeof(ptdev->gpu_info); ++ return 0; ++ ++ case DRM_PANTHOR_DEV_QUERY_CSIF_INFO: ++ args->size = sizeof(ptdev->csif_info); ++ return 0; ++ ++ default: ++ return -EINVAL; ++ } ++ } ++ ++ switch (args->type) { ++ case DRM_PANTHOR_DEV_QUERY_GPU_INFO: ++ return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->gpu_info); ++ ++ case DRM_PANTHOR_DEV_QUERY_CSIF_INFO: ++ return PANTHOR_UOBJ_SET(args->pointer, args->size, ptdev->csif_info); ++ ++ default: ++ return -EINVAL; ++ } ++} ++ ++#define PANTHOR_VM_CREATE_FLAGS 0 ++ ++static int panthor_ioctl_vm_create(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base); ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_vm_create *args = data; ++ int cookie, ret; ++ ++ if (!drm_dev_enter(ddev, &cookie)) ++ return -ENODEV; ++ ++ ret = panthor_vm_pool_create_vm(ptdev, pfile->vms, args); ++ if (ret >= 0) { ++ args->id = ret; ++ ret = 0; ++ } ++ ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static int panthor_ioctl_vm_destroy(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_vm_destroy *args = data; ++ ++ if (args->pad) ++ return -EINVAL; ++ ++ return panthor_vm_pool_destroy_vm(pfile->vms, args->id); ++} ++ ++#define PANTHOR_BO_FLAGS DRM_PANTHOR_BO_NO_MMAP ++ ++static int panthor_ioctl_bo_create(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_bo_create *args = data; ++ struct panthor_vm *vm = NULL; ++ int cookie, ret; ++ ++ if (!drm_dev_enter(ddev, &cookie)) ++ return -ENODEV; ++ ++ if (!args->size || args->pad || ++ (args->flags & ~PANTHOR_BO_FLAGS)) { ++ ret = -EINVAL; ++ goto out_dev_exit; ++ } ++ ++ if (args->exclusive_vm_id) { ++ vm = panthor_vm_pool_get_vm(pfile->vms, args->exclusive_vm_id); ++ if (!vm) { ++ ret = -EINVAL; ++ goto out_dev_exit; ++ } ++ } ++ ++ ret = panthor_gem_create_with_handle(file, ddev, vm, &args->size, ++ args->flags, &args->handle); ++ ++ panthor_vm_put(vm); ++ ++out_dev_exit: ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static int panthor_ioctl_bo_mmap_offset(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct drm_panthor_bo_mmap_offset *args = data; ++ struct drm_gem_object *obj; ++ int ret; ++ ++ if (args->pad) ++ return -EINVAL; ++ ++ obj = drm_gem_object_lookup(file, args->handle); ++ if (!obj) ++ return -ENOENT; ++ ++ ret = drm_gem_create_mmap_offset(obj); ++ if (ret) ++ goto out; ++ ++ args->offset = drm_vma_node_offset_addr(&obj->vma_node); ++ ++out: ++ drm_gem_object_put(obj); ++ return ret; ++} ++ ++static int panthor_ioctl_group_submit(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_group_submit *args = data; ++ struct drm_panthor_queue_submit *jobs_args; ++ struct panthor_submit_ctx ctx; ++ int ret = 0, cookie; ++ ++ if (args->pad) ++ return -EINVAL; ++ ++ if (!drm_dev_enter(ddev, &cookie)) ++ return -ENODEV; ++ ++ ret = PANTHOR_UOBJ_GET_ARRAY(jobs_args, &args->queue_submits); ++ if (ret) ++ goto out_dev_exit; ++ ++ ret = panthor_submit_ctx_init(&ctx, file, args->queue_submits.count); ++ if (ret) ++ goto out_free_jobs_args; ++ ++ /* Create jobs and attach sync operations */ ++ for (u32 i = 0; i < args->queue_submits.count; i++) { ++ const struct drm_panthor_queue_submit *qsubmit = &jobs_args[i]; ++ struct drm_sched_job *job; ++ ++ job = panthor_job_create(pfile, args->group_handle, qsubmit); ++ if (IS_ERR(job)) { ++ ret = PTR_ERR(job); ++ goto out_cleanup_submit_ctx; ++ } ++ ++ ret = panthor_submit_ctx_add_job(&ctx, i, job, &qsubmit->syncs); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ } ++ ++ /* ++ * Collect signal operations on all jobs, such that each job can pick ++ * from it for its dependencies and update the fence to signal when the ++ * job is submitted. ++ */ ++ ret = panthor_submit_ctx_collect_jobs_signal_ops(&ctx); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ ++ /* ++ * We acquire/prepare revs on all jobs before proceeding with the ++ * dependency registration. ++ * ++ * This is solving two problems: ++ * 1. drm_sched_job_arm() and drm_sched_entity_push_job() must be ++ * protected by a lock to make sure no concurrent access to the same ++ * entity get interleaved, which would mess up with the fence seqno ++ * ordering. Luckily, one of the resv being acquired is the VM resv, ++ * and a scheduling entity is only bound to a single VM. As soon as ++ * we acquire the VM resv, we should be safe. ++ * 2. Jobs might depend on fences that were issued by previous jobs in ++ * the same batch, so we can't add dependencies on all jobs before ++ * arming previous jobs and registering the fence to the signal ++ * array, otherwise we might miss dependencies, or point to an ++ * outdated fence. ++ */ ++ if (args->queue_submits.count > 0) { ++ /* All jobs target the same group, so they also point to the same VM. */ ++ struct panthor_vm *vm = panthor_job_vm(ctx.jobs[0].job); ++ ++ drm_exec_until_all_locked(&ctx.exec) { ++ ret = panthor_vm_prepare_mapped_bos_resvs(&ctx.exec, vm, ++ args->queue_submits.count); ++ } ++ ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ } ++ ++ /* ++ * Now that resvs are locked/prepared, we can iterate over each job to ++ * add the dependencies, arm the job fence, register the job fence to ++ * the signal array. ++ */ ++ ret = panthor_submit_ctx_add_deps_and_arm_jobs(&ctx); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ ++ /* Nothing can fail after that point, so we can make our job fences ++ * visible to the outside world. Push jobs and set the job fences to ++ * the resv slots we reserved. This also pushes the fences to the ++ * syncobjs that are part of the signal array. ++ */ ++ panthor_submit_ctx_push_jobs(&ctx, panthor_job_update_resvs); ++ ++out_cleanup_submit_ctx: ++ panthor_submit_ctx_cleanup(&ctx, panthor_job_put); ++ ++out_free_jobs_args: ++ kvfree(jobs_args); ++ ++out_dev_exit: ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static int panthor_ioctl_group_destroy(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_group_destroy *args = data; ++ ++ if (args->pad) ++ return -EINVAL; ++ ++ return panthor_group_destroy(pfile, args->group_handle); ++} ++ ++static int panthor_ioctl_group_create(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_group_create *args = data; ++ struct drm_panthor_queue_create *queue_args; ++ int ret; ++ ++ if (!args->queues.count) ++ return -EINVAL; ++ ++ ret = PANTHOR_UOBJ_GET_ARRAY(queue_args, &args->queues); ++ if (ret) ++ return ret; ++ ++ ret = panthor_group_create(pfile, args, queue_args); ++ if (ret >= 0) { ++ args->group_handle = ret; ++ ret = 0; ++ } ++ ++ kvfree(queue_args); ++ return ret; ++} ++ ++static int panthor_ioctl_group_get_state(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_group_get_state *args = data; ++ ++ return panthor_group_get_state(pfile, args); ++} ++ ++static int panthor_ioctl_tiler_heap_create(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_tiler_heap_create *args = data; ++ struct panthor_heap_pool *pool; ++ struct panthor_vm *vm; ++ int ret; ++ ++ vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id); ++ if (!vm) ++ return -EINVAL; ++ ++ pool = panthor_vm_get_heap_pool(vm, true); ++ if (IS_ERR(pool)) { ++ ret = PTR_ERR(pool); ++ goto out_put_vm; ++ } ++ ++ ret = panthor_heap_create(pool, ++ args->initial_chunk_count, ++ args->chunk_size, ++ args->max_chunks, ++ args->target_in_flight, ++ &args->tiler_heap_ctx_gpu_va, ++ &args->first_heap_chunk_gpu_va); ++ if (ret < 0) ++ goto out_put_heap_pool; ++ ++ /* Heap pools are per-VM. We combine the VM and HEAP id to make ++ * a unique heap handle. ++ */ ++ args->handle = (args->vm_id << 16) | ret; ++ ret = 0; ++ ++out_put_heap_pool: ++ panthor_heap_pool_put(pool); ++ ++out_put_vm: ++ panthor_vm_put(vm); ++ return ret; ++} ++ ++static int panthor_ioctl_tiler_heap_destroy(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_tiler_heap_destroy *args = data; ++ struct panthor_heap_pool *pool; ++ struct panthor_vm *vm; ++ int ret; ++ ++ if (args->pad) ++ return -EINVAL; ++ ++ vm = panthor_vm_pool_get_vm(pfile->vms, args->handle >> 16); ++ if (!vm) ++ return -EINVAL; ++ ++ pool = panthor_vm_get_heap_pool(vm, false); ++ if (!pool) { ++ ret = -EINVAL; ++ goto out_put_vm; ++ } ++ ++ ret = panthor_heap_destroy(pool, args->handle & GENMASK(15, 0)); ++ panthor_heap_pool_put(pool); ++ ++out_put_vm: ++ panthor_vm_put(vm); ++ return ret; ++} ++ ++static int panthor_ioctl_vm_bind_async(struct drm_device *ddev, ++ struct drm_panthor_vm_bind *args, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_vm_bind_op *jobs_args; ++ struct panthor_submit_ctx ctx; ++ struct panthor_vm *vm; ++ int ret = 0; ++ ++ vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id); ++ if (!vm) ++ return -EINVAL; ++ ++ ret = PANTHOR_UOBJ_GET_ARRAY(jobs_args, &args->ops); ++ if (ret) ++ goto out_put_vm; ++ ++ ret = panthor_submit_ctx_init(&ctx, file, args->ops.count); ++ if (ret) ++ goto out_free_jobs_args; ++ ++ for (u32 i = 0; i < args->ops.count; i++) { ++ struct drm_panthor_vm_bind_op *op = &jobs_args[i]; ++ struct drm_sched_job *job; ++ ++ job = panthor_vm_bind_job_create(file, vm, op); ++ if (IS_ERR(job)) { ++ ret = PTR_ERR(job); ++ goto out_cleanup_submit_ctx; ++ } ++ ++ ret = panthor_submit_ctx_add_job(&ctx, i, job, &op->syncs); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ } ++ ++ ret = panthor_submit_ctx_collect_jobs_signal_ops(&ctx); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ ++ /* Prepare reservation objects for each VM_BIND job. */ ++ drm_exec_until_all_locked(&ctx.exec) { ++ for (u32 i = 0; i < ctx.job_count; i++) { ++ ret = panthor_vm_bind_job_prepare_resvs(&ctx.exec, ctx.jobs[i].job); ++ drm_exec_retry_on_contention(&ctx.exec); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ } ++ } ++ ++ ret = panthor_submit_ctx_add_deps_and_arm_jobs(&ctx); ++ if (ret) ++ goto out_cleanup_submit_ctx; ++ ++ /* Nothing can fail after that point. */ ++ panthor_submit_ctx_push_jobs(&ctx, panthor_vm_bind_job_update_resvs); ++ ++out_cleanup_submit_ctx: ++ panthor_submit_ctx_cleanup(&ctx, panthor_vm_bind_job_put); ++ ++out_free_jobs_args: ++ kvfree(jobs_args); ++ ++out_put_vm: ++ panthor_vm_put(vm); ++ return ret; ++} ++ ++static int panthor_ioctl_vm_bind_sync(struct drm_device *ddev, ++ struct drm_panthor_vm_bind *args, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_vm_bind_op *jobs_args; ++ struct panthor_vm *vm; ++ int ret; ++ ++ vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id); ++ if (!vm) ++ return -EINVAL; ++ ++ ret = PANTHOR_UOBJ_GET_ARRAY(jobs_args, &args->ops); ++ if (ret) ++ goto out_put_vm; ++ ++ for (u32 i = 0; i < args->ops.count; i++) { ++ ret = panthor_vm_bind_exec_sync_op(file, vm, &jobs_args[i]); ++ if (ret) { ++ /* Update ops.count so the user knows where things failed. */ ++ args->ops.count = i; ++ break; ++ } ++ } ++ ++ kvfree(jobs_args); ++ ++out_put_vm: ++ panthor_vm_put(vm); ++ return ret; ++} ++ ++#define PANTHOR_VM_BIND_FLAGS DRM_PANTHOR_VM_BIND_ASYNC ++ ++static int panthor_ioctl_vm_bind(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct drm_panthor_vm_bind *args = data; ++ int cookie, ret; ++ ++ if (!drm_dev_enter(ddev, &cookie)) ++ return -ENODEV; ++ ++ if (args->flags & DRM_PANTHOR_VM_BIND_ASYNC) ++ ret = panthor_ioctl_vm_bind_async(ddev, args, file); ++ else ++ ret = panthor_ioctl_vm_bind_sync(ddev, args, file); ++ ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static int panthor_ioctl_vm_get_state(struct drm_device *ddev, void *data, ++ struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ struct drm_panthor_vm_get_state *args = data; ++ struct panthor_vm *vm; ++ ++ vm = panthor_vm_pool_get_vm(pfile->vms, args->vm_id); ++ if (!vm) ++ return -EINVAL; ++ ++ if (panthor_vm_is_unusable(vm)) ++ args->state = DRM_PANTHOR_VM_STATE_UNUSABLE; ++ else ++ args->state = DRM_PANTHOR_VM_STATE_USABLE; ++ ++ panthor_vm_put(vm); ++ return 0; ++} ++ ++static int ++panthor_open(struct drm_device *ddev, struct drm_file *file) ++{ ++ struct panthor_device *ptdev = container_of(ddev, struct panthor_device, base); ++ struct panthor_file *pfile; ++ int ret; ++ ++ if (!try_module_get(THIS_MODULE)) ++ return -EINVAL; ++ ++ pfile = kzalloc(sizeof(*pfile), GFP_KERNEL); ++ if (!pfile) { ++ ret = -ENOMEM; ++ goto err_put_mod; ++ } ++ ++ pfile->ptdev = ptdev; ++ ++ ret = panthor_vm_pool_create(pfile); ++ if (ret) ++ goto err_free_file; ++ ++ ret = panthor_group_pool_create(pfile); ++ if (ret) ++ goto err_destroy_vm_pool; ++ ++ file->driver_priv = pfile; ++ return 0; ++ ++err_destroy_vm_pool: ++ panthor_vm_pool_destroy(pfile); ++ ++err_free_file: ++ kfree(pfile); ++ ++err_put_mod: ++ module_put(THIS_MODULE); ++ return ret; ++} ++ ++static void ++panthor_postclose(struct drm_device *ddev, struct drm_file *file) ++{ ++ struct panthor_file *pfile = file->driver_priv; ++ ++ panthor_group_pool_destroy(pfile); ++ panthor_vm_pool_destroy(pfile); ++ ++ kfree(pfile); ++ module_put(THIS_MODULE); ++} ++ ++static const struct drm_ioctl_desc panthor_drm_driver_ioctls[] = { ++#define PANTHOR_IOCTL(n, func, flags) \ ++ DRM_IOCTL_DEF_DRV(PANTHOR_##n, panthor_ioctl_##func, flags) ++ ++ PANTHOR_IOCTL(DEV_QUERY, dev_query, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(VM_CREATE, vm_create, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(VM_DESTROY, vm_destroy, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(VM_BIND, vm_bind, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(VM_GET_STATE, vm_get_state, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(BO_CREATE, bo_create, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(BO_MMAP_OFFSET, bo_mmap_offset, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(GROUP_CREATE, group_create, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(GROUP_DESTROY, group_destroy, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(GROUP_GET_STATE, group_get_state, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(TILER_HEAP_CREATE, tiler_heap_create, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(TILER_HEAP_DESTROY, tiler_heap_destroy, DRM_RENDER_ALLOW), ++ PANTHOR_IOCTL(GROUP_SUBMIT, group_submit, DRM_RENDER_ALLOW), ++}; ++ ++static int panthor_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ struct drm_file *file = filp->private_data; ++ struct panthor_file *pfile = file->driver_priv; ++ struct panthor_device *ptdev = pfile->ptdev; ++ u64 offset = (u64)vma->vm_pgoff << PAGE_SHIFT; ++ int ret, cookie; ++ ++ if (!drm_dev_enter(file->minor->dev, &cookie)) ++ return -ENODEV; ++ ++ if (panthor_device_mmio_offset(offset) >= DRM_PANTHOR_USER_MMIO_OFFSET) ++ ret = panthor_device_mmap_io(ptdev, vma); ++ else ++ ret = drm_gem_mmap(filp, vma); ++ ++ drm_dev_exit(cookie); ++ return ret; ++} ++ ++static const struct file_operations panthor_drm_driver_fops = { ++ .open = drm_open, ++ .release = drm_release, ++ .unlocked_ioctl = drm_ioctl, ++ .compat_ioctl = drm_compat_ioctl, ++ .poll = drm_poll, ++ .read = drm_read, ++ .llseek = noop_llseek, ++ .mmap = panthor_mmap, ++}; ++ ++#ifdef CONFIG_DEBUG_FS ++static void panthor_debugfs_init(struct drm_minor *minor) ++{ ++ panthor_mmu_debugfs_init(minor); ++} ++#endif ++ ++/* ++ * PanCSF driver version: ++ * - 1.0 - initial interface ++ */ ++static const struct drm_driver panthor_drm_driver = { ++ .driver_features = DRIVER_RENDER | DRIVER_GEM | DRIVER_SYNCOBJ | ++ DRIVER_SYNCOBJ_TIMELINE | DRIVER_GEM_GPUVA, ++ .open = panthor_open, ++ .postclose = panthor_postclose, ++ .ioctls = panthor_drm_driver_ioctls, ++ .num_ioctls = ARRAY_SIZE(panthor_drm_driver_ioctls), ++ .fops = &panthor_drm_driver_fops, ++ .name = "panthor", ++ .desc = "Panthor DRM driver", ++ .date = "20230801", ++ .major = 1, ++ .minor = 0, ++ ++ .gem_create_object = panthor_gem_create_object, ++ .gem_prime_import_sg_table = drm_gem_shmem_prime_import_sg_table, ++#ifdef CONFIG_DEBUG_FS ++ .debugfs_init = panthor_debugfs_init, ++#endif ++}; ++ ++static int panthor_probe(struct platform_device *pdev) ++{ ++ struct panthor_device *ptdev; ++ ++ ptdev = devm_drm_dev_alloc(&pdev->dev, &panthor_drm_driver, ++ struct panthor_device, base); ++ if (!ptdev) ++ return -ENOMEM; ++ ++ platform_set_drvdata(pdev, ptdev); ++ ++ return panthor_device_init(ptdev); ++} ++ ++static void panthor_remove(struct platform_device *pdev) ++{ ++ struct panthor_device *ptdev = platform_get_drvdata(pdev); ++ ++ panthor_device_unplug(ptdev); ++} ++ ++static const struct of_device_id dt_match[] = { ++ { .compatible = "rockchip,rk3588-mali" }, ++ { .compatible = "arm,mali-valhall-csf" }, ++ {} ++}; ++MODULE_DEVICE_TABLE(of, dt_match); ++ ++static DEFINE_RUNTIME_DEV_PM_OPS(panthor_pm_ops, ++ panthor_device_suspend, ++ panthor_device_resume, ++ NULL); ++ ++static struct platform_driver panthor_driver = { ++ .probe = panthor_probe, ++ .remove_new = panthor_remove, ++ .driver = { ++ .name = "panthor", ++ .pm = &panthor_pm_ops, ++ .of_match_table = dt_match, ++ }, ++}; ++ ++/* ++ * Workqueue used to cleanup stuff. ++ * ++ * We create a dedicated workqueue so we can drain on unplug and ++ * make sure all resources are freed before the module is unloaded. ++ */ ++struct workqueue_struct *panthor_cleanup_wq; ++ ++static int __init panthor_init(void) ++{ ++ int ret; ++ ++ ret = panthor_mmu_pt_cache_init(); ++ if (ret) ++ return ret; ++ ++ panthor_cleanup_wq = alloc_workqueue("panthor-cleanup", WQ_UNBOUND, 0); ++ if (!panthor_cleanup_wq) { ++ pr_err("panthor: Failed to allocate the workqueues"); ++ ret = -ENOMEM; ++ goto err_mmu_pt_cache_fini; ++ } ++ ++ ret = platform_driver_register(&panthor_driver); ++ if (ret) ++ goto err_destroy_cleanup_wq; ++ ++ return 0; ++ ++err_destroy_cleanup_wq: ++ destroy_workqueue(panthor_cleanup_wq); ++ ++err_mmu_pt_cache_fini: ++ panthor_mmu_pt_cache_fini(); ++ return ret; ++} ++module_init(panthor_init); ++ ++static void __exit panthor_exit(void) ++{ ++ platform_driver_unregister(&panthor_driver); ++ destroy_workqueue(panthor_cleanup_wq); ++ panthor_mmu_pt_cache_fini(); ++} ++module_exit(panthor_exit); ++ ++MODULE_AUTHOR("Panthor Project Developers"); ++MODULE_DESCRIPTION("Panthor DRM Driver"); ++MODULE_LICENSE("Dual MIT/GPL"); +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Thu, 29 Feb 2024 17:22:26 +0100 +Subject: drm/panthor: Allow driver compilation + +Now that all blocks are available, we can add/update Kconfig/Makefile +files to allow compilation. + +v6: +- Add Maxime's and Heiko's acks +- Keep source files alphabetically ordered in the Makefile + +v4: +- Add Steve's R-b + +v3: +- Add a dep on DRM_GPUVM +- Fix dependencies in Kconfig +- Expand help text to (hopefully) describe which GPUs are to be + supported by this driver and which are for panfrost. + +Co-developed-by: Steven Price +Signed-off-by: Steven Price +Signed-off-by: Boris Brezillon +Acked-by: Steven Price # MIT+GPL2 relicensing,Arm +Acked-by: Grant Likely # MIT+GPL2 relicensing,Linaro +Acked-by: Boris Brezillon # MIT+GPL2 relicensing,Collabora +Reviewed-by: Steven Price +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + drivers/gpu/drm/Kconfig | 2 + + drivers/gpu/drm/Makefile | 1 + + drivers/gpu/drm/panthor/Kconfig | 23 ++++++++++ + drivers/gpu/drm/panthor/Makefile | 14 ++++++ + 4 files changed, 40 insertions(+) + +diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig +index c7edba18a6f0..b94a2fe4f462 100644 +--- a/drivers/gpu/drm/Kconfig ++++ b/drivers/gpu/drm/Kconfig +@@ -384,6 +384,8 @@ source "drivers/gpu/drm/lima/Kconfig" + + source "drivers/gpu/drm/panfrost/Kconfig" + ++source "drivers/gpu/drm/panthor/Kconfig" ++ + source "drivers/gpu/drm/aspeed/Kconfig" + + source "drivers/gpu/drm/mcde/Kconfig" +diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile +index 104b42df2e95..6eb2b553a163 100644 +--- a/drivers/gpu/drm/Makefile ++++ b/drivers/gpu/drm/Makefile +@@ -179,6 +179,7 @@ obj-$(CONFIG_DRM_XEN) += xen/ + obj-$(CONFIG_DRM_VBOXVIDEO) += vboxvideo/ + obj-$(CONFIG_DRM_LIMA) += lima/ + obj-$(CONFIG_DRM_PANFROST) += panfrost/ ++obj-$(CONFIG_DRM_PANTHOR) += panthor/ + obj-$(CONFIG_DRM_ASPEED_GFX) += aspeed/ + obj-$(CONFIG_DRM_MCDE) += mcde/ + obj-$(CONFIG_DRM_TIDSS) += tidss/ +diff --git a/drivers/gpu/drm/panthor/Kconfig b/drivers/gpu/drm/panthor/Kconfig +new file mode 100644 +index 000000000000..55b40ad07f3b +--- /dev/null ++++ b/drivers/gpu/drm/panthor/Kconfig +@@ -0,0 +1,23 @@ ++# SPDX-License-Identifier: GPL-2.0 or MIT ++ ++config DRM_PANTHOR ++ tristate "Panthor (DRM support for ARM Mali CSF-based GPUs)" ++ depends on DRM ++ depends on ARM || ARM64 || COMPILE_TEST ++ depends on !GENERIC_ATOMIC64 # for IOMMU_IO_PGTABLE_LPAE ++ depends on MMU ++ select DEVFREQ_GOV_SIMPLE_ONDEMAND ++ select DRM_EXEC ++ select DRM_GEM_SHMEM_HELPER ++ select DRM_GPUVM ++ select DRM_SCHED ++ select IOMMU_IO_PGTABLE_LPAE ++ select IOMMU_SUPPORT ++ select PM_DEVFREQ ++ help ++ DRM driver for ARM Mali CSF-based GPUs. ++ ++ This driver is for Mali (or Immortalis) Valhall Gxxx GPUs. ++ ++ Note that the Mali-G68 and Mali-G78, while Valhall architecture, will ++ be supported with the panfrost driver as they are not CSF GPUs. +diff --git a/drivers/gpu/drm/panthor/Makefile b/drivers/gpu/drm/panthor/Makefile +new file mode 100644 +index 000000000000..15294719b09c +--- /dev/null ++++ b/drivers/gpu/drm/panthor/Makefile +@@ -0,0 +1,14 @@ ++# SPDX-License-Identifier: GPL-2.0 or MIT ++ ++panthor-y := \ ++ panthor_devfreq.o \ ++ panthor_device.o \ ++ panthor_drv.o \ ++ panthor_fw.o \ ++ panthor_gem.o \ ++ panthor_gpu.o \ ++ panthor_heap.o \ ++ panthor_mmu.o \ ++ panthor_sched.o ++ ++obj-$(CONFIG_DRM_PANTHOR) += panthor.o +-- +Armbian + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Liviu Dudau +Date: Thu, 29 Feb 2024 17:22:27 +0100 +Subject: dt-bindings: gpu: mali-valhall-csf: Add support for Arm Mali CSF GPUs + +Arm has introduced a new v10 GPU architecture that replaces the Job Manager +interface with a new Command Stream Frontend. It adds firmware driven +command stream queues that can be used by kernel and user space to submit +jobs to the GPU. + +Add the initial schema for the device tree that is based on support for +RK3588 SoC. The minimum number of clocks is one for the IP, but on Rockchip +platforms they will tend to expose the semi-independent clocks for better +power management. + +v6: +- Add Maxime's and Heiko's acks + +v5: +- Move the opp-table node under the gpu node + +v4: +- Fix formatting issue + +v3: +- Cleanup commit message to remove redundant text +- Added opp-table property and re-ordered entries +- Clarified power-domains and power-domain-names requirements for RK3588. +- Cleaned up example + +Note: power-domains and power-domain-names requirements for other platforms +are still work in progress, hence the bindings are left incomplete here. + +v2: +- New commit + +Signed-off-by: Liviu Dudau +Cc: Krzysztof Kozlowski +Cc: Rob Herring +Cc: Conor Dooley +Cc: devicetree@vger.kernel.org +Signed-off-by: Boris Brezillon +Reviewed-by: Rob Herring +Acked-by: Maxime Ripard +Acked-by: Heiko Stuebner +--- + Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml | 147 ++++++++++ + 1 file changed, 147 insertions(+) + +diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml b/Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml +new file mode 100644 +index 000000000000..a5b4e0021758 +--- /dev/null ++++ b/Documentation/devicetree/bindings/gpu/arm,mali-valhall-csf.yaml +@@ -0,0 +1,147 @@ ++# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause ++%YAML 1.2 ++--- ++$id: http://devicetree.org/schemas/gpu/arm,mali-valhall-csf.yaml# ++$schema: http://devicetree.org/meta-schemas/core.yaml# ++ ++title: ARM Mali Valhall GPU ++ ++maintainers: ++ - Liviu Dudau ++ - Boris Brezillon ++ ++properties: ++ $nodename: ++ pattern: '^gpu@[a-f0-9]+$' ++ ++ compatible: ++ oneOf: ++ - items: ++ - enum: ++ - rockchip,rk3588-mali ++ - const: arm,mali-valhall-csf # Mali Valhall GPU model/revision is fully discoverable ++ ++ reg: ++ maxItems: 1 ++ ++ interrupts: ++ items: ++ - description: Job interrupt ++ - description: MMU interrupt ++ - description: GPU interrupt ++ ++ interrupt-names: ++ items: ++ - const: job ++ - const: mmu ++ - const: gpu ++ ++ clocks: ++ minItems: 1 ++ maxItems: 3 ++ ++ clock-names: ++ minItems: 1 ++ items: ++ - const: core ++ - const: coregroup ++ - const: stacks ++ ++ mali-supply: true ++ ++ operating-points-v2: true ++ opp-table: ++ type: object ++ ++ power-domains: ++ minItems: 1 ++ maxItems: 5 ++ ++ power-domain-names: ++ minItems: 1 ++ maxItems: 5 ++ ++ sram-supply: true ++ ++ "#cooling-cells": ++ const: 2 ++ ++ dynamic-power-coefficient: ++ $ref: /schemas/types.yaml#/definitions/uint32 ++ description: ++ A u32 value that represents the running time dynamic ++ power coefficient in units of uW/MHz/V^2. The ++ coefficient can either be calculated from power ++ measurements or derived by analysis. ++ ++ The dynamic power consumption of the GPU is ++ proportional to the square of the Voltage (V) and ++ the clock frequency (f). The coefficient is used to ++ calculate the dynamic power as below - ++ ++ Pdyn = dynamic-power-coefficient * V^2 * f ++ ++ where voltage is in V, frequency is in MHz. ++ ++ dma-coherent: true ++ ++required: ++ - compatible ++ - reg ++ - interrupts ++ - interrupt-names ++ - clocks ++ - mali-supply ++ ++additionalProperties: false ++ ++allOf: ++ - if: ++ properties: ++ compatible: ++ contains: ++ const: rockchip,rk3588-mali ++ then: ++ properties: ++ clocks: ++ minItems: 3 ++ power-domains: ++ maxItems: 1 ++ power-domain-names: false ++ ++examples: ++ - | ++ #include ++ #include ++ #include ++ #include ++ ++ gpu: gpu@fb000000 { ++ compatible = "rockchip,rk3588-mali", "arm,mali-valhall-csf"; ++ reg = <0xfb000000 0x200000>; ++ interrupts = , ++ , ++ ; ++ interrupt-names = "job", "mmu", "gpu"; ++ clock-names = "core", "coregroup", "stacks"; ++ clocks = <&cru CLK_GPU>, <&cru CLK_GPU_COREGROUP>, ++ <&cru CLK_GPU_STACKS>; ++ power-domains = <&power RK3588_PD_GPU>; ++ operating-points-v2 = <&gpu_opp_table>; ++ mali-supply = <&vdd_gpu_s0>; ++ sram-supply = <&vdd_gpu_mem_s0>; ++ ++ gpu_opp_table: opp-table { ++ compatible = "operating-points-v2"; ++ opp-300000000 { ++ opp-hz = /bits/ 64 <300000000>; ++ opp-microvolt = <675000 675000 850000>; ++ }; ++ opp-400000000 { ++ opp-hz = /bits/ 64 <400000000>; ++ opp-microvolt = <675000 675000 850000>; ++ }; ++ }; ++ }; ++ ++... +-- +Armbian + diff --git a/patch/kernel/rockchip-rk3588-edge/0042-arm64-dts-rockchip-rk3588-Add-GPU-nodes.patch b/patch/kernel/rockchip-rk3588-edge/0042-arm64-dts-rockchip-rk3588-Add-GPU-nodes.patch new file mode 100644 index 0000000000..6b146a7cdf --- /dev/null +++ b/patch/kernel/rockchip-rk3588-edge/0042-arm64-dts-rockchip-rk3588-Add-GPU-nodes.patch @@ -0,0 +1,150 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Mon, 7 Aug 2023 17:30:58 +0200 +Subject: arm64: dts: rockchip: rk3588: Add GPU nodes + +Signed-off-by: Sebastian Reichel +--- + arch/arm64/boot/dts/rockchip/rk3588s.dtsi | 119 ++++++++++ + 1 file changed, 119 insertions(+) + +diff --git a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi +index 762a095648b1..f43f10340d5d 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3588s.dtsi ++++ b/arch/arm64/boot/dts/rockchip/rk3588s.dtsi +@@ -962,6 +962,120 @@ usb_host2_xhci: usb@fcd00000 { + snps,dis-del-phy-power-chg-quirk; + snps,dis-tx-ipgap-linecheck-quirk; + snps,dis_rxdet_inp3_quirk; ++ }; ++ ++ gpu_opp_table: gpu-opp-table { ++ compatible = "operating-points-v2"; ++ ++ nvmem-cells = <&gpu_leakage>; ++ nvmem-cell-names = "leakage"; ++ ++ rockchip,pvtm-voltage-sel = < ++ 0 815 0 ++ 816 835 1 ++ 836 860 2 ++ 861 885 3 ++ 886 910 4 ++ 911 9999 5 ++ >; ++ rockchip,pvtm-pvtpll; ++ rockchip,pvtm-offset = <0x1c>; ++ rockchip,pvtm-sample-time = <1100>; ++ rockchip,pvtm-freq = <800000>; ++ rockchip,pvtm-volt = <750000>; ++ rockchip,pvtm-ref-temp = <25>; ++ rockchip,pvtm-temp-prop = <(-135) (-135)>; ++ rockchip,pvtm-thermal-zone = "gpu-thermal"; ++ ++ clocks = <&cru CLK_GPU>; ++ clock-names = "clk"; ++ rockchip,grf = <&gpu_grf>; ++ volt-mem-read-margin = < ++ 855000 1 ++ 765000 2 ++ 675000 3 ++ 495000 4 ++ >; ++ low-volt-mem-read-margin = <4>; ++ intermediate-threshold-freq = <400000>; /* KHz */ ++ ++ rockchip,temp-hysteresis = <5000>; ++ rockchip,low-temp = <10000>; ++ rockchip,low-temp-min-volt = <750000>; ++ rockchip,high-temp = <85000>; ++ rockchip,high-temp-max-freq = <800000>; ++ ++ opp-300000000 { ++ opp-hz = /bits/ 64 <300000000>; ++ opp-microvolt = <675000 675000 850000>; ++ }; ++ opp-400000000 { ++ opp-hz = /bits/ 64 <400000000>; ++ opp-microvolt = <675000 675000 850000>; ++ }; ++ opp-500000000 { ++ opp-hz = /bits/ 64 <500000000>; ++ opp-microvolt = <675000 675000 850000>; ++ }; ++ opp-600000000 { ++ opp-hz = /bits/ 64 <600000000>; ++ opp-microvolt = <675000 675000 850000>; ++ }; ++ opp-700000000 { ++ opp-hz = /bits/ 64 <700000000>; ++ opp-microvolt = <700000 700000 850000>; ++ opp-microvolt-L2 = <687500 687500 850000>; ++ opp-microvolt-L3 = <675000 675000 850000>; ++ opp-microvolt-L4 = <675000 675000 850000>; ++ opp-microvolt-L5 = <675000 675000 850000>; ++ }; ++ opp-800000000 { ++ opp-hz = /bits/ 64 <800000000>; ++ opp-microvolt = <750000 750000 850000>; ++ opp-microvolt-L1 = <737500 737500 850000>; ++ opp-microvolt-L2 = <725000 725000 850000>; ++ opp-microvolt-L3 = <712500 712500 850000>; ++ opp-microvolt-L4 = <700000 700000 850000>; ++ opp-microvolt-L5 = <700000 700000 850000>; ++ }; ++ opp-900000000 { ++ opp-hz = /bits/ 64 <900000000>; ++ opp-microvolt = <800000 800000 850000>; ++ opp-microvolt-L1 = <787500 787500 850000>; ++ opp-microvolt-L2 = <775000 775000 850000>; ++ opp-microvolt-L3 = <762500 762500 850000>; ++ opp-microvolt-L4 = <750000 750000 850000>; ++ opp-microvolt-L5 = <737500 737500 850000>; ++ }; ++ opp-1000000000 { ++ opp-hz = /bits/ 64 <1000000000>; ++ opp-microvolt = <850000 850000 850000>; ++ opp-microvolt-L1 = <837500 837500 850000>; ++ opp-microvolt-L2 = <825000 825000 850000>; ++ opp-microvolt-L3 = <812500 812500 850000>; ++ opp-microvolt-L4 = <800000 800000 850000>; ++ opp-microvolt-L5 = <787500 787500 850000>; ++ }; ++ }; ++ ++ gpu: gpu@fb000000 { ++ compatible = "rockchip,rk3588-mali", "arm,mali-valhall-csf"; ++ reg = <0x0 0xfb000000 0x0 0x200000>; ++ interrupts = , ++ , ++ ; ++ interrupt-names = "job", "mmu", "gpu"; ++ ++ clock-names = "core", "coregroup", "stacks"; ++ clocks = <&cru CLK_GPU>, <&cru CLK_GPU_COREGROUP>, ++ <&cru CLK_GPU_STACKS>; ++ assigned-clocks = <&scmi_clk SCMI_CLK_GPU>; ++ assigned-clock-rates = <200000000>; ++ power-domains = <&power RK3588_PD_GPU>; ++ operating-points-v2 = <&gpu_opp_table>; ++ #cooling-cells = <2>; ++ dynamic-power-coefficient = <2982>; ++ + status = "disabled"; + }; + +@@ -3124,6 +3238,11 @@ gpio4: gpio@fec50000 { + }; + }; + ++ gpu_grf: syscon@fd5a0000 { ++ compatible = "rockchip,rk3588-gpu-grf", "syscon"; ++ reg = <0x0 0xfd5a0000 0x0 0x100>; ++ }; ++ + av1d: video-codec@fdc70000 { + compatible = "rockchip,rk3588-av1-vpu"; + reg = <0x0 0xfdc70000 0x0 0x800>; +-- +Armbian + diff --git a/patch/kernel/rockchip-rk3588-edge/0043-arm64-dts-rockchip-rk3588-rock5b-Add-GPU-node.patch b/patch/kernel/rockchip-rk3588-edge/0043-arm64-dts-rockchip-rk3588-rock5b-Add-GPU-node.patch new file mode 100644 index 0000000000..679b735be8 --- /dev/null +++ b/patch/kernel/rockchip-rk3588-edge/0043-arm64-dts-rockchip-rk3588-rock5b-Add-GPU-node.patch @@ -0,0 +1,38 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Boris Brezillon +Date: Tue, 8 Aug 2023 12:05:22 +0200 +Subject: arm64: dts: rockchip: rk3588-rock5b: Add GPU node + +Signed-off-by: Boris Brezillon +Signed-off-by: Sebastian Reichel +--- + arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts +index 98192a35920c..7cb5874f3ec2 100644 +--- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts ++++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b.dts +@@ -182,6 +182,11 @@ &cpu_l3 { + mem-supply = <&vdd_cpu_lit_mem_s0>; + }; + ++&gpu { ++ mali-supply = <&vdd_gpu_s0>; ++ status = "okay"; ++}; ++ + &hdmi0 { + status = "okay"; + }; +@@ -490,6 +495,7 @@ rk806_dvs3_null: dvs3-null-pins { + + regulators { + vdd_gpu_s0: vdd_gpu_mem_s0: dcdc-reg1 { ++ regulator-always-on; + regulator-boot-on; + regulator-min-microvolt = <550000>; + regulator-max-microvolt = <950000>; +-- +Armbian +