mirror of https://git.citron-emu.org/citron/emu

Revert "Android: Implement TLB optimization to prevent deadlocks and improve performance"

This reverts commit 21594b73aa.

parent 51800e249b
commit 8cb6e6d5d4

@@ -1,5 +1,4 @@
 // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
-// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <algorithm>
@@ -328,11 +327,8 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& m
                                        DescriptorPool& descriptor_pool)
     : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
       staging_pool{staging_pool_}, guest_descriptor_queue{guest_descriptor_queue_},
-      accelerate{nullptr},
       quad_index_pass(device, scheduler, descriptor_pool, staging_pool,
                       compute_pass_descriptor_queue) {
-    accelerate = new BufferCacheAccelerator();
-
     if (device.GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY) {
         // TODO: FixMe: Uint8Pass compute shader does not build on some Qualcomm drivers.
         uint8_pass = std::make_unique<Uint8Pass>(device, scheduler, descriptor_pool, staging_pool,
@@ -673,30 +669,4 @@ vk::Buffer BufferCacheRuntime::CreateNullBuffer() {
     return ret;
 }
 
-void BufferCacheRuntime::InsertTLBBarrierImpl() {
-#ifdef ANDROID
-    // Create a memory barrier specifically optimized for TLB coherency
-    // This helps prevent Android-specific deadlocks by ensuring proper
-    // GPU<->GPU memory coherency without a full pipeline stall
-    static constexpr VkMemoryBarrier TLB_BARRIER{
-        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
-        .pNext = nullptr,
-        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
-        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
-    };
-
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([](vk::CommandBuffer cmdbuf) {
-        cmdbuf.PipelineBarrier(
-            VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
-            VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-            0, TLB_BARRIER, {}, {});
-    });
-#endif
-}
-
-BufferCacheRuntime::~BufferCacheRuntime() {
-    delete accelerate;
-}
-
 } // namespace Vulkan

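Note: the hunk above removes BufferCacheRuntime::InsertTLBBarrierImpl(), which recorded a single global VkMemoryBarrier through the emulator's scheduler on Android builds. For readers unfamiliar with the scheduler wrapper, roughly the same barrier expressed directly against the raw Vulkan C API is sketched below; this is an illustrative sketch, not code from the repository, and the function name is invented.

#include <vulkan/vulkan.h>

// Illustrative sketch: the reverted barrier as a raw Vulkan call.
// cmdbuf is assumed to be a command buffer in the recording state.
void InsertGlobalBarrier(VkCommandBuffer cmdbuf) {
    const VkMemoryBarrier barrier{
        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
    };
    vkCmdPipelineBarrier(cmdbuf,
                         VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,        // source: all previously submitted work
                         VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
                             VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, // destination: vertex and fragment shading
                         0,                                          // no dependency flags
                         1, &barrier,                                // one global memory barrier
                         0, nullptr,                                 // no buffer memory barriers
                         0, nullptr);                                // no image memory barriers
}

A VkMemoryBarrier of this form makes prior GPU writes available and visible to later reads and writes between the named pipeline stages; Vulkan exposes no direct TLB control to applications, so the "TLB" naming in the removed code describes the intent of the workaround rather than the mechanism.
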
@@ -1,5 +1,4 @@
 // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
-// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #pragma once
@@ -23,21 +22,6 @@ class Scheduler;
 struct HostVertexBinding;
 
 class BufferCacheRuntime;
-class BufferCacheAccelerator;
-
-struct OverlapResult {
-    bool has_stream_buffer;
-    bool has_written_buffer;
-};
-
-class BufferCacheAccelerator {
-public:
-    OverlapResult CheckRangeOverlaps(DAddr addr, u64 size) {
-        // Simple implementation - assume there are overlaps
-        // This can be expanded with actual buffer tracking if needed
-        return OverlapResult{true, true};
-    }
-};
 
 class Buffer : public VideoCommon::BufferBase {
 public:
@@ -96,7 +80,6 @@ public:
                        GuestDescriptorQueue& guest_descriptor_queue,
                        ComputePassDescriptorQueue& compute_pass_descriptor_queue,
                        DescriptorPool& descriptor_pool);
-    ~BufferCacheRuntime();
 
     void TickFrame(Common::SlotVector<Buffer>& slot_buffers) noexcept;
 
@@ -162,22 +145,6 @@ public:
         guest_descriptor_queue.AddTexelBuffer(buffer.View(offset, size, format));
     }
 
-    /// TLB-aware memory barrier to prevent deadlocks, particularly on Android
-    void InsertTLBBarrier(DAddr addr, u64 size) {
-        // This provides a more precise way to synchronize memory
-        // without causing unnecessary TLB invalidations
-#ifdef ANDROID
-        std::scoped_lock lock{mutex};
-        OverlapResult result = accelerate->CheckRangeOverlaps(addr, size);
-        if (!result.has_stream_buffer && !result.has_written_buffer) {
-            // If no overlap with active memory, skip barrier to maintain TLB entries
-            return;
-        }
-
-        InsertTLBBarrierImpl();
-#endif
-    }
-
 private:
     void BindBuffer(VkBuffer buffer, u32 offset, u32 size) {
         guest_descriptor_queue.AddBuffer(buffer, offset, size);
@@ -185,7 +152,6 @@ private:
 
     void ReserveNullBuffer();
     vk::Buffer CreateNullBuffer();
-    void InsertTLBBarrierImpl();
 
     const Device& device;
     MemoryAllocator& memory_allocator;
@@ -198,9 +164,6 @@ private:
 
     vk::Buffer null_buffer;
 
-    std::mutex mutex;
-    BufferCacheAccelerator* accelerate;
-
     std::unique_ptr<Uint8Pass> uint8_pass;
     QuadIndexedPass quad_index_pass;
 };

@@ -718,34 +718,7 @@ void RasterizerVulkan::FlushAndInvalidateRegion(DAddr addr, u64 size,
     if (Settings::IsGPULevelExtreme()) {
         FlushRegion(addr, size, which);
     }
-
-    // TLB optimization to avoid redundant flushing and potential deadlocks
-    static constexpr size_t TLB_CACHE_SIZE = 128;
-    static std::array<std::pair<DAddr, u64>, TLB_CACHE_SIZE> tlb_cache;
-    static size_t tlb_cache_index = 0;
-    static std::mutex tlb_mutex;
-
-    {
-        std::scoped_lock lock{tlb_mutex};
-        // Check if this region is already in our TLB cache
-        bool found_in_tlb = false;
-        for (const auto& entry : tlb_cache) {
-            if (entry.first <= addr && addr + size <= entry.first + entry.second) {
-                // This region is already in our TLB cache, no need to flush
-                found_in_tlb = true;
-                break;
-            }
-        }
-
-        if (!found_in_tlb) {
-            // Add to TLB cache
-            tlb_cache[tlb_cache_index] = {addr, size};
-            tlb_cache_index = (tlb_cache_index + 1) % TLB_CACHE_SIZE;
-
-            // Proceed with normal invalidation
-            InvalidateRegion(addr, size, which);
-        }
-    }
+    InvalidateRegion(addr, size, which);
 }
 
 void RasterizerVulkan::WaitForIdle() {

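Note: the reverted change gated InvalidateRegion behind a ring buffer of recently seen (addr, size) ranges, guarded by a function-local static mutex, and skipped the invalidation whenever a range was already covered by a cached entry; the restored code above calls InvalidateRegion unconditionally. A self-contained sketch of that caching pattern is shown below; the class and member names are illustrative and not part of the codebase.

#include <array>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <utility>

// Sketch of the reverted ring-buffer range cache (illustrative names only).
class RecentRangeCache {
public:
    // Returns true if [addr, addr + size) is fully covered by a single cached entry.
    bool Contains(std::uint64_t addr, std::uint64_t size) {
        std::scoped_lock lock{mutex};
        for (const auto& [base, length] : entries) {
            if (base <= addr && addr + size <= base + length) {
                return true;
            }
        }
        return false;
    }

    // Records a range, overwriting the oldest entry once the buffer wraps around.
    void Insert(std::uint64_t addr, std::uint64_t size) {
        std::scoped_lock lock{mutex};
        entries[next] = {addr, size};
        next = (next + 1) % entries.size();
    }

private:
    std::array<std::pair<std::uint64_t, std::uint64_t>, 128> entries{};
    std::size_t next = 0;
    std::mutex mutex;
};

As the diff shows, a range that hit this cache was never re-invalidated, even if the guest wrote to it again later; restoring the unconditional InvalidateRegion call is the behavioral change this revert makes.
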
@@ -875,18 +848,6 @@ void RasterizerVulkan::LoadDiskResources(u64 title_id, std::stop_token stop_load
 void RasterizerVulkan::FlushWork() {
 #ifdef ANDROID
     static constexpr u32 DRAWS_TO_DISPATCH = 1024;
-
-    // Android-specific TLB optimization to prevent deadlocks
-    // This limits the maximum number of outstanding memory operations to avoid TLB thrashing
-    static constexpr u32 MAX_TLB_OPERATIONS = 64;
-    static u32 tlb_operation_counter = 0;
-
-    if (++tlb_operation_counter >= MAX_TLB_OPERATIONS) {
-        // Force a flush to ensure memory operations complete
-        scheduler.Flush();
-        scheduler.WaitIdle(); // Make sure all operations complete to clear TLB state
-        tlb_operation_counter = 0;
-    }
 #else
     static constexpr u32 DRAWS_TO_DISPATCH = 4096;
 #endif // ANDROID

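Note: the hunk above removes an Android-only counter from RasterizerVulkan::FlushWork() that forced a scheduler flush plus a full WaitIdle() every 64 calls. Reduced to its essentials, the pattern looks like the sketch below; Scheduler is a placeholder for the emulator's scheduler type, and the function name and snippet are illustrative only.

#include <cstdint>

// Illustrative sketch of the reverted throttle: after a fixed number of calls,
// submit all recorded work and block until the GPU has drained it.
template <typename Scheduler>
void ThrottleEveryN(Scheduler& scheduler) {
    static constexpr std::uint32_t kMaxOutstandingOps = 64; // threshold used by the reverted code
    static std::uint32_t counter = 0;
    if (++counter >= kMaxOutstandingOps) {
        scheduler.Flush();    // submit pending command buffers
        scheduler.WaitIdle(); // synchronous wait: the CPU stalls until the GPU is idle
        counter = 0;
    }
}

The synchronous WaitIdle() is the expensive part of this pattern, since it serializes CPU and GPU on every 64th call; the code that remains keeps only the smaller Android draw-dispatch threshold (DRAWS_TO_DISPATCH of 1024 versus 4096 elsewhere).
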
@@ -1,5 +1,4 @@
 // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
-// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <memory>
@@ -282,24 +281,6 @@ void Scheduler::EndPendingOperations()
         // This is problematic on Android, disable on GPU Normal.
         // query_cache->DisableStreams();
     }
-
-    // Add TLB-aware memory barrier handling for Android
-    // This reduces the likelihood of deadlocks due to memory stalls
-    static constexpr VkMemoryBarrier TLB_OPTIMIZED_BARRIER{
-        .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
-        .pNext = nullptr,
-        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
-        // Only use necessary access flags to avoid full TLB flush
-        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT,
-    };
-
-    Record([barrier = TLB_OPTIMIZED_BARRIER](vk::CommandBuffer cmdbuf) {
-        // Use a more specific pipeline stage for better performance
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
-                               VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                                   VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-                               0, barrier);
-    });
 #else
     // query_cache->DisableStreams();
 #endif

@@ -1677,35 +1677,7 @@ bool TextureCacheRuntime::CanReportMemoryUsage() const {
     return device.CanReportMemoryUsage();
 }
 
-void TextureCacheRuntime::TickFrame() {
-    // Implement TLB prefetching for better memory access patterns
-    // This helps avoid the 0.0 FPS deadlock issues on Android
-    static std::vector<VkDeviceSize> tlb_prefetch_offsets;
-    static std::vector<VkDeviceSize> tlb_prefetch_sizes;
-    static std::vector<VkImageMemoryBarrier> tlb_prefetch_barriers;
-
-    // Clear previous frame's data
-    tlb_prefetch_offsets.clear();
-    tlb_prefetch_sizes.clear();
-    tlb_prefetch_barriers.clear();
-
-#ifdef ANDROID
-    // Prefetch commonly accessed texture memory regions
-    // This helps the TLB maintain a more stable state and prevents cache thrashing
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([this](vk::CommandBuffer cmdbuf) {
-        if (!tlb_prefetch_barriers.empty()) {
-            cmdbuf.PipelineBarrier(
-                VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-                VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT,
-                0,
-                vk::Span<VkMemoryBarrier>{},
-                vk::Span<VkBufferMemoryBarrier>{},
-                vk::Span(tlb_prefetch_barriers.data(), tlb_prefetch_barriers.size()));
-        }
-    });
-#endif
-}
+void TextureCacheRuntime::TickFrame() {}
 
 Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_,
              VAddr cpu_addr_)

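Note: the removed TickFrame() body declared function-local static vectors for "TLB prefetching", cleared them each frame, and recorded a pipeline barrier only when tlb_prefetch_barriers was non-empty; nothing in the removed hunk ever populates that vector, so as written the recorded barrier could only take effect if it was filled elsewhere. For context, batching image barriers with the raw Vulkan API normally looks like the illustrative sketch below; the function name, image handle, access masks, and layouts are placeholders rather than values from the codebase.

#include <cstdint>
#include <vector>
#include <vulkan/vulkan.h>

// Illustrative sketch: transition one image from transfer-write to shader-read,
// batching the barriers in a vector before a single vkCmdPipelineBarrier call.
void RecordImageBarriers(VkCommandBuffer cmdbuf, VkImage image) {
    std::vector<VkImageMemoryBarrier> barriers;
    barriers.push_back(VkImageMemoryBarrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
        .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
        .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
        .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1},
    });
    if (!barriers.empty()) {
        vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                             VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, nullptr, 0, nullptr,
                             static_cast<std::uint32_t>(barriers.size()), barriers.data());
    }
}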