 6eb85e846f
			
		
	
	
		6eb85e846f
		
			
		
	
	
	
	
		
			
			* experimental changes to try and reduce allocations in kernel threading and DMA handler * Simplify the changes in this branch to just 1. Don't make unnecessary copies of data just for texture-texture transfers and 2. Add a fast path for 1bpp linear byte copies * forgot to check src + dst linearity in 1bpp DMA fast path. Fixes the UE4 regression. * removing dev log I left in * Generalizing the DMA linear fast path to cases other than 1bpp copies * revert kernel changes * revert whitespace * remove unneeded references * PR feedback Co-authored-by: Logan Stromberg <lostromb@microsoft.com> Co-authored-by: gdk <gab.dark.100@gmail.com>
		
			
				
	
	
		
			451 lines
		
	
	
	
		
			19 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
			
		
		
	
	
			451 lines
		
	
	
	
		
			19 KiB
		
	
	
	
		
			C#
		
	
	
	
	
	
| using Ryujinx.Common;
 | |
| using Ryujinx.Graphics.Device;
 | |
| using Ryujinx.Graphics.Gpu.Engine.Threed;
 | |
| using Ryujinx.Graphics.Gpu.Memory;
 | |
| using Ryujinx.Graphics.Texture;
 | |
| using System;
 | |
| using System.Collections.Generic;
 | |
| using System.Runtime.CompilerServices;
 | |
| using System.Runtime.Intrinsics;
 | |
| 
 | |
| namespace Ryujinx.Graphics.Gpu.Engine.Dma
 | |
| {
 | |
|     /// <summary>
 | |
|     /// Represents a DMA copy engine class.
 | |
|     /// </summary>
 | |
|     class DmaClass : IDeviceState
 | |
|     {
 | |
        // GPU context; queried for the timestamp written by four-word semaphore releases.
        private readonly GpuContext _context;
        // GPU channel; its MemoryManager is used for all GPU virtual memory reads and writes done by this class.
        private readonly GpuChannel _channel;
        // 3D engine; FlushUboDirty is called on it before a DMA copy is performed.
        private readonly ThreedClass _3dEngine;
        // Register state of this class; a write to LaunchDma is routed to the LaunchDma callback.
        private readonly DeviceState<DmaClassState> _state;
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Copy flags passed on DMA launch.
 | |
|         /// </summary>
 | |
|         [Flags]
 | |
|         private enum CopyFlags
 | |
|         {
 | |
|             SrcLinear = 1 << 7,
 | |
|             DstLinear = 1 << 8,
 | |
|             MultiLineEnable = 1 << 9,
 | |
|             RemapEnable = 1 << 10
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Creates a new instance of the DMA copy engine class.
 | |
|         /// </summary>
 | |
|         /// <param name="context">GPU context</param>
 | |
|         /// <param name="channel">GPU channel</param>
 | |
|         /// <param name="threedEngine">3D engine</param>
 | |
|         public DmaClass(GpuContext context, GpuChannel channel, ThreedClass threedEngine)
 | |
|         {
 | |
|             _context = context;
 | |
|             _channel = channel;
 | |
|             _3dEngine = threedEngine;
 | |
|             _state = new DeviceState<DmaClassState>(new Dictionary<string, RwCallback>
 | |
|             {
 | |
|                 { nameof(DmaClassState.LaunchDma), new RwCallback(LaunchDma, null) }
 | |
|             });
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Reads data from the class registers.
 | |
|         /// </summary>
 | |
|         /// <param name="offset">Register byte offset</param>
 | |
|         /// <returns>Data at the specified offset</returns>
 | |
|         public int Read(int offset) => _state.Read(offset);
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Writes data to the class registers.
 | |
|         /// </summary>
 | |
|         /// <param name="offset">Register byte offset</param>
 | |
|         /// <param name="data">Data to be written</param>
 | |
|         public void Write(int offset, int data) => _state.Write(offset, data);
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Determine if a buffer-to-texture region covers the entirety of a texture.
 | |
|         /// </summary>
 | |
|         /// <param name="tex">Texture to compare</param>
 | |
|         /// <param name="linear">True if the texture is linear, false if block linear</param>
 | |
|         /// <param name="bpp">Texture bytes per pixel</param>
 | |
|         /// <param name="stride">Texture stride</param>
 | |
|         /// <param name="xCount">Number of pixels to be copied</param>
 | |
|         /// <param name="yCount">Number of lines to be copied</param>
 | |
|         /// <returns></returns>
 | |
|         private static bool IsTextureCopyComplete(DmaTexture tex, bool linear, int bpp, int stride, int xCount, int yCount)
 | |
|         {
 | |
|             if (linear)
 | |
|             {
 | |
|                 // If the stride is negative, the texture has to be flipped, so
 | |
|                 // the fast copy is not trivial, use the slow path.
 | |
|                 if (stride <= 0)
 | |
|                 {
 | |
|                     return false;
 | |
|                 }
 | |
| 
 | |
|                 int alignWidth = Constants.StrideAlignment / bpp;
 | |
|                 return stride / bpp == BitUtils.AlignUp(xCount, alignWidth);
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 int alignWidth = Constants.GobAlignment / bpp;
 | |
|                 return tex.RegionX == 0 &&
 | |
|                        tex.RegionY == 0 &&
 | |
|                        tex.Width == BitUtils.AlignUp(xCount, alignWidth) &&
 | |
|                        tex.Height == yCount;
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Releases a semaphore for a given LaunchDma method call.
 | |
|         /// </summary>
 | |
|         /// <param name="argument">The LaunchDma call argument</param>
 | |
|         private void ReleaseSemaphore(int argument)
 | |
|         {
 | |
|             LaunchDmaSemaphoreType type = (LaunchDmaSemaphoreType)((argument >> 3) & 0x3);
 | |
|             if (type != LaunchDmaSemaphoreType.None)
 | |
|             {
 | |
|                 ulong address = ((ulong)_state.State.SetSemaphoreA << 32) | _state.State.SetSemaphoreB;
 | |
|                 if (type == LaunchDmaSemaphoreType.ReleaseOneWordSemaphore)
 | |
|                 {
 | |
|                     _channel.MemoryManager.Write(address, _state.State.SetSemaphorePayload);
 | |
|                 }
 | |
|                 else /* if (type == LaunchDmaSemaphoreType.ReleaseFourWordSemaphore) */
 | |
|                 {
 | |
|                     _channel.MemoryManager.Write(address + 8, _context.GetTimestamp());
 | |
|                     _channel.MemoryManager.Write(address, (ulong)_state.State.SetSemaphorePayload);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Performs a buffer to buffer, or buffer to texture copy.
 | |
|         /// </summary>
 | |
|         /// <param name="argument">The LaunchDma call argument</param>
 | |
|         private void DmaCopy(int argument)
 | |
|         {
 | |
|             var memoryManager = _channel.MemoryManager;
 | |
| 
 | |
|             CopyFlags copyFlags = (CopyFlags)argument;
 | |
| 
 | |
|             bool srcLinear = copyFlags.HasFlag(CopyFlags.SrcLinear);
 | |
|             bool dstLinear = copyFlags.HasFlag(CopyFlags.DstLinear);
 | |
|             bool copy2D = copyFlags.HasFlag(CopyFlags.MultiLineEnable);
 | |
|             bool remap = copyFlags.HasFlag(CopyFlags.RemapEnable);
 | |
| 
 | |
|             uint size = _state.State.LineLengthIn;
 | |
| 
 | |
|             if (size == 0)
 | |
|             {
 | |
|                 return;
 | |
|             }
 | |
| 
 | |
|             ulong srcGpuVa = ((ulong)_state.State.OffsetInUpperUpper << 32) | _state.State.OffsetInLower;
 | |
|             ulong dstGpuVa = ((ulong)_state.State.OffsetOutUpperUpper << 32) | _state.State.OffsetOutLower;
 | |
| 
 | |
|             int xCount = (int)_state.State.LineLengthIn;
 | |
|             int yCount = (int)_state.State.LineCount;
 | |
| 
 | |
|             _3dEngine.FlushUboDirty();
 | |
| 
 | |
|             if (copy2D)
 | |
|             {
 | |
|                 // Buffer to texture copy.
 | |
|                 int componentSize = (int)_state.State.SetRemapComponentsComponentSize + 1;
 | |
|                 int srcBpp = remap ? ((int)_state.State.SetRemapComponentsNumSrcComponents + 1) * componentSize : 1;
 | |
|                 int dstBpp = remap ? ((int)_state.State.SetRemapComponentsNumDstComponents + 1) * componentSize : 1;
 | |
| 
 | |
|                 var dst = Unsafe.As<uint, DmaTexture>(ref _state.State.SetDstBlockSize);
 | |
|                 var src = Unsafe.As<uint, DmaTexture>(ref _state.State.SetSrcBlockSize);
 | |
| 
 | |
|                 int srcRegionX = 0, srcRegionY = 0, dstRegionX = 0, dstRegionY = 0;
 | |
| 
 | |
|                 if (!srcLinear)
 | |
|                 {
 | |
|                     srcRegionX = src.RegionX;
 | |
|                     srcRegionY = src.RegionY;
 | |
|                 }
 | |
| 
 | |
|                 if (!dstLinear)
 | |
|                 {
 | |
|                     dstRegionX = dst.RegionX;
 | |
|                     dstRegionY = dst.RegionY;
 | |
|                 }
 | |
| 
 | |
|                 int srcStride = (int)_state.State.PitchIn;
 | |
|                 int dstStride = (int)_state.State.PitchOut;
 | |
| 
 | |
|                 var srcCalculator = new OffsetCalculator(
 | |
|                     src.Width,
 | |
|                     src.Height,
 | |
|                     srcStride,
 | |
|                     srcLinear,
 | |
|                     src.MemoryLayout.UnpackGobBlocksInY(),
 | |
|                     src.MemoryLayout.UnpackGobBlocksInZ(),
 | |
|                     srcBpp);
 | |
| 
 | |
|                 var dstCalculator = new OffsetCalculator(
 | |
|                     dst.Width,
 | |
|                     dst.Height,
 | |
|                     dstStride,
 | |
|                     dstLinear,
 | |
|                     dst.MemoryLayout.UnpackGobBlocksInY(),
 | |
|                     dst.MemoryLayout.UnpackGobBlocksInZ(),
 | |
|                     dstBpp);
 | |
| 
 | |
|                 (int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(srcRegionX, srcRegionY, xCount, yCount);
 | |
|                 (int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dstRegionX, dstRegionY, xCount, yCount);
 | |
| 
 | |
|                 if (srcLinear && srcStride < 0)
 | |
|                 {
 | |
|                     srcBaseOffset += srcStride * (yCount - 1);
 | |
|                 }
 | |
| 
 | |
|                 if (dstLinear && dstStride < 0)
 | |
|                 {
 | |
|                     dstBaseOffset += dstStride * (yCount - 1);
 | |
|                 }
 | |
| 
 | |
|                 ReadOnlySpan<byte> srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true);
 | |
| 
 | |
|                 bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount);
 | |
|                 bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount);
 | |
| 
 | |
|                 if (completeSource && completeDest)
 | |
|                 {
 | |
|                     var target = memoryManager.Physical.TextureCache.FindTexture(
 | |
|                         memoryManager,
 | |
|                         dst,
 | |
|                         dstGpuVa,
 | |
|                         dstBpp,
 | |
|                         dstStride,
 | |
|                         xCount,
 | |
|                         yCount,
 | |
|                         dstLinear);
 | |
| 
 | |
|                     if (target != null)
 | |
|                     {
 | |
|                         ReadOnlySpan<byte> data;
 | |
|                         if (srcLinear)
 | |
|                         {
 | |
|                             data = LayoutConverter.ConvertLinearStridedToLinear(
 | |
|                                 target.Info.Width,
 | |
|                                 target.Info.Height,
 | |
|                                 1,
 | |
|                                 1,
 | |
|                                 xCount * srcBpp,
 | |
|                                 srcStride,
 | |
|                                 target.Info.FormatInfo.BytesPerPixel,
 | |
|                                 srcSpan);
 | |
|                         }
 | |
|                         else
 | |
|                         {
 | |
|                             data = LayoutConverter.ConvertBlockLinearToLinear(
 | |
|                                 src.Width,
 | |
|                                 src.Height,
 | |
|                                 src.Depth,
 | |
|                                 1,
 | |
|                                 1,
 | |
|                                 1,
 | |
|                                 1,
 | |
|                                 1,
 | |
|                                 srcBpp,
 | |
|                                 src.MemoryLayout.UnpackGobBlocksInY(),
 | |
|                                 src.MemoryLayout.UnpackGobBlocksInZ(),
 | |
|                                 1,
 | |
|                                 new SizeInfo((int)target.Size),
 | |
|                                 srcSpan);
 | |
|                         }
 | |
| 
 | |
|                         target.SynchronizeMemory();
 | |
|                         target.SetData(data);
 | |
|                         target.SignalModified();
 | |
|                         return;
 | |
|                     }
 | |
|                     else if (srcCalculator.LayoutMatches(dstCalculator))
 | |
|                     {
 | |
|                         // No layout conversion has to be performed, just copy the data entirely.
 | |
|                         memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan);
 | |
|                         return;
 | |
|                     }
 | |
|                 }
 | |
| 
 | |
|                 unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged
 | |
|                 {
 | |
|                     if (srcLinear && dstLinear && srcBpp == dstBpp)
 | |
|                     {
 | |
|                         // Optimized path for purely linear copies - we don't need to calculate every single byte offset,
 | |
|                         // and we can make use of Span.CopyTo which is very very fast (even compared to pointers)
 | |
|                         for (int y = 0; y < yCount; y++)
 | |
|                         {
 | |
|                             srcCalculator.SetY(srcRegionY + y);
 | |
|                             dstCalculator.SetY(dstRegionY + y);
 | |
|                             int srcOffset = srcCalculator.GetOffset(srcRegionX);
 | |
|                             int dstOffset = dstCalculator.GetOffset(dstRegionX);
 | |
|                             srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp)
 | |
|                                 .CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp));
 | |
|                         }
 | |
|                     }
 | |
|                     else
 | |
|                     {
 | |
|                         fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
 | |
|                         {
 | |
|                             byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
 | |
|                             byte* srcBase = srcPtr - srcBaseOffset;
 | |
| 
 | |
|                             for (int y = 0; y < yCount; y++)
 | |
|                             {
 | |
|                                 srcCalculator.SetY(srcRegionY + y);
 | |
|                                 dstCalculator.SetY(dstRegionY + y);
 | |
| 
 | |
|                                 for (int x = 0; x < xCount; x++)
 | |
|                                 {
 | |
|                                     int srcOffset = srcCalculator.GetOffset(srcRegionX + x);
 | |
|                                     int dstOffset = dstCalculator.GetOffset(dstRegionX + x);
 | |
| 
 | |
|                                     *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
 | |
|                                 }
 | |
|                             }
 | |
|                         }
 | |
|                     }
 | |
| 
 | |
|                     return true;
 | |
|                 }
 | |
| 
 | |
|                 // OPT: This allocates a (potentially) huge temporary array and then copies an existing
 | |
|                 // region of memory into it, data that might get overwritten entirely anyways. Ideally this should
 | |
|                 // all be rewritten to use pooled arrays, but that gets complicated with packed data and strides
 | |
|                 Span<byte> dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray();
 | |
| 
 | |
|                 bool _ = srcBpp switch
 | |
|                 {
 | |
|                     1 => Convert<byte>(dstSpan, srcSpan),
 | |
|                     2 => Convert<ushort>(dstSpan, srcSpan),
 | |
|                     4 => Convert<uint>(dstSpan, srcSpan),
 | |
|                     8 => Convert<ulong>(dstSpan, srcSpan),
 | |
|                     12 => Convert<Bpp12Pixel>(dstSpan, srcSpan),
 | |
|                     16 => Convert<Vector128<byte>>(dstSpan, srcSpan),
 | |
|                     _ => throw new NotSupportedException($"Unable to copy ${srcBpp} bpp pixel format.")
 | |
|                 };
 | |
| 
 | |
|                 memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan);
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 if (remap &&
 | |
|                     _state.State.SetRemapComponentsDstX == SetRemapComponentsDst.ConstA &&
 | |
|                     _state.State.SetRemapComponentsDstY == SetRemapComponentsDst.ConstA &&
 | |
|                     _state.State.SetRemapComponentsDstZ == SetRemapComponentsDst.ConstA &&
 | |
|                     _state.State.SetRemapComponentsDstW == SetRemapComponentsDst.ConstA &&
 | |
|                     _state.State.SetRemapComponentsNumSrcComponents == SetRemapComponentsNumComponents.One &&
 | |
|                     _state.State.SetRemapComponentsNumDstComponents == SetRemapComponentsNumComponents.One &&
 | |
|                     _state.State.SetRemapComponentsComponentSize == SetRemapComponentsComponentSize.Four)
 | |
|                 {
 | |
|                     // Fast path for clears when remap is enabled.
 | |
|                     memoryManager.Physical.BufferCache.ClearBuffer(memoryManager, dstGpuVa, size * 4, _state.State.SetRemapConstA);
 | |
|                 }
 | |
|                 else
 | |
|                 {
 | |
|                     // TODO: Implement remap functionality.
 | |
|                     // Buffer to buffer copy.
 | |
| 
 | |
|                     bool srcIsPitchKind = memoryManager.GetKind(srcGpuVa).IsPitch();
 | |
|                     bool dstIsPitchKind = memoryManager.GetKind(dstGpuVa).IsPitch();
 | |
| 
 | |
|                     if (!srcIsPitchKind && dstIsPitchKind)
 | |
|                     {
 | |
|                         CopyGobBlockLinearToLinear(memoryManager, srcGpuVa, dstGpuVa, size);
 | |
|                     }
 | |
|                     else if (srcIsPitchKind && !dstIsPitchKind)
 | |
|                     {
 | |
|                         CopyGobLinearToBlockLinear(memoryManager, srcGpuVa, dstGpuVa, size);
 | |
|                     }
 | |
|                     else
 | |
|                     {
 | |
|                         memoryManager.Physical.BufferCache.CopyBuffer(memoryManager, srcGpuVa, dstGpuVa, size);
 | |
|                     }
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Copies block linear data with block linear GOBs to a block linear destination with linear GOBs.
 | |
|         /// </summary>
 | |
|         /// <param name="memoryManager">GPU memory manager</param>
 | |
|         /// <param name="srcGpuVa">Source GPU virtual address</param>
 | |
|         /// <param name="dstGpuVa">Destination GPU virtual address</param>
 | |
|         /// <param name="size">Size in bytes of the copy</param>
 | |
|         private static void CopyGobBlockLinearToLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
 | |
|         {
 | |
|             if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
 | |
|             {
 | |
|                 for (ulong offset = 0; offset < size; offset += 16)
 | |
|                 {
 | |
|                     Vector128<byte> data = memoryManager.Read<Vector128<byte>>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
 | |
|                     memoryManager.Write(dstGpuVa + offset, data);
 | |
|                 }
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 for (ulong offset = 0; offset < size; offset++)
 | |
|                 {
 | |
|                     byte data = memoryManager.Read<byte>(ConvertGobLinearToBlockLinearAddress(srcGpuVa + offset), true);
 | |
|                     memoryManager.Write(dstGpuVa + offset, data);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Copies block linear data with linear GOBs to a block linear destination with block linear GOBs.
 | |
|         /// </summary>
 | |
|         /// <param name="memoryManager">GPU memory manager</param>
 | |
|         /// <param name="srcGpuVa">Source GPU virtual address</param>
 | |
|         /// <param name="dstGpuVa">Destination GPU virtual address</param>
 | |
|         /// <param name="size">Size in bytes of the copy</param>
 | |
|         private static void CopyGobLinearToBlockLinear(MemoryManager memoryManager, ulong srcGpuVa, ulong dstGpuVa, ulong size)
 | |
|         {
 | |
|             if (((srcGpuVa | dstGpuVa | size) & 0xf) == 0)
 | |
|             {
 | |
|                 for (ulong offset = 0; offset < size; offset += 16)
 | |
|                 {
 | |
|                     Vector128<byte> data = memoryManager.Read<Vector128<byte>>(srcGpuVa + offset, true);
 | |
|                     memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
 | |
|                 }
 | |
|             }
 | |
|             else
 | |
|             {
 | |
|                 for (ulong offset = 0; offset < size; offset++)
 | |
|                 {
 | |
|                     byte data = memoryManager.Read<byte>(srcGpuVa + offset, true);
 | |
|                     memoryManager.Write(ConvertGobLinearToBlockLinearAddress(dstGpuVa + offset), data);
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Calculates the GOB block linear address from a linear address.
 | |
|         /// </summary>
 | |
|         /// <param name="address">Linear address</param>
 | |
|         /// <returns>Block linear address</returns>
 | |
|         private static ulong ConvertGobLinearToBlockLinearAddress(ulong address)
 | |
|         {
 | |
|             // y2 y1 y0 x5 x4 x3 x2 x1 x0 -> x5 y2 y1 x4 y0 x3 x2 x1 x0
 | |
|             return (address & ~0x1f0UL) |
 | |
|                 ((address & 0x40) >> 2) |
 | |
|                 ((address & 0x10) << 1) |
 | |
|                 ((address & 0x180) >> 1) |
 | |
|                 ((address & 0x20) << 3);
 | |
|         }
 | |
| 
 | |
|         /// <summary>
 | |
|         /// Performs a buffer to buffer, or buffer to texture copy, then optionally releases a semaphore.
 | |
|         /// </summary>
 | |
|         /// <param name="argument">Method call argument</param>
 | |
|         private void LaunchDma(int argument)
 | |
|         {
 | |
|             DmaCopy(argument);
 | |
|             ReleaseSemaphore(argument);
 | |
|         }
 | |
|     }
 | |
| }
 |