using Ryujinx.Common;
using Ryujinx.Graphics.Device;
using Ryujinx.Graphics.Texture;
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace Ryujinx.Graphics.Gpu.Engine.InlineToMemory
{
    /// <summary>
    /// Represents an Inline-to-Memory engine class.
    /// </summary>
    class InlineToMemoryClass : IDeviceState
    {
        private readonly GpuContext _context;
        private readonly GpuChannel _channel;

        // Only created when the class owns its own register state (see the constructor).
        private readonly DeviceState<InlineToMemoryClassState> _state;

        // Parameters latched by LaunchDma for the transfer currently in flight.
        private bool _isLinear;
        private int _offset; // Number of 32-bit words received so far.
        private int _size;   // Total transfer size in bytes, with every line padded to 4 bytes.

        private ulong _dstGpuVa;
        private int _dstX;
        private int _dstY;
        private int _dstWidth;
        private int _dstHeight;
        private int _dstStride;
        private int _dstGobBlocksInY;
        private int _dstGobBlocksInZ;
        private int _lineLengthIn;
        private int _lineCount;

        private bool _finished;

        // Staging buffer for the inline data; reused across transfers and only grown when too small.
        private int[] _buffer;

        /// <summary>
        /// Creates a new instance of the Inline-to-Memory engine class.
        /// </summary>
        /// <param name="context">GPU context</param>
        /// <param name="channel">GPU channel</param>
        /// <param name="initializeState">Indicates if the internal state should be initialized. Set to false if part of another engine</param>
        public InlineToMemoryClass(GpuContext context, GpuChannel channel, bool initializeState)
        {
            _context = context;
            _channel = channel;

            if (initializeState)
            {
                _state = new DeviceState<InlineToMemoryClassState>(new Dictionary<string, RwCallback>
                {
                    { nameof(InlineToMemoryClassState.LaunchDma), new RwCallback(LaunchDma, null) },
                    { nameof(InlineToMemoryClassState.LoadInlineData), new RwCallback(LoadInlineData, null) }
                });
            }
        }

        /// <summary>
        /// Creates a new instance of the inline-to-memory engine class.
        /// </summary>
        /// <param name="context">GPU context</param>
        /// <param name="channel">GPU channel</param>
        public InlineToMemoryClass(GpuContext context, GpuChannel channel) : this(context, channel, true)
        {
        }

        /// <summary>
        /// Reads data from the class registers.
        /// </summary>
        /// <remarks>
        /// The register state only exists when the instance was created with state initialization enabled.
        /// </remarks>
        /// <param name="offset">Register byte offset</param>
        /// <returns>Data at the specified offset</returns>
        public int Read(int offset) => _state.Read(offset);

        /// <summary>
        /// Writes data to the class registers.
        /// </summary>
        /// <remarks>
        /// The register state only exists when the instance was created with state initialization enabled.
        /// </remarks>
        /// <param name="offset">Register byte offset</param>
        /// <param name="data">Data to be written</param>
        public void Write(int offset, int data) => _state.Write(offset, data);

        /// <summary>
        /// Launches Inline-to-Memory engine DMA copy.
        /// </summary>
        /// <param name="argument">Method call argument</param>
        private void LaunchDma(int argument)
        {
            LaunchDma(ref _state.State, argument);
        }

        /// <summary>
        /// Launches Inline-to-Memory engine DMA copy.
        /// </summary>
        /// <remarks>
        /// This only latches the transfer parameters and resets the receive position;
        /// the copy itself happens once enough data has been pushed through LoadInlineData.
        /// </remarks>
        /// <param name="state">Current class state</param>
        /// <param name="argument">Method call argument</param>
        public void LaunchDma(ref InlineToMemoryClassState state, int argument)
        {
            // Bit 0 of the argument selects a linear (pitch) destination layout.
            _isLinear = (argument & 1) != 0;

            _offset = 0;

            // Each line is padded to a whole number of words, as data arrives one word at a time.
            _size = (int)(BitUtils.AlignUp(state.LineLengthIn, 4) * state.LineCount);

            int count = _size / 4;

            if (_buffer == null || _buffer.Length < count)
            {
                _buffer = new int[count];
            }

            _dstGpuVa = ((ulong)state.OffsetOutUpperValue << 32) | state.OffsetOut;

            _dstX = state.SetDstOriginBytesXV;
            _dstY = state.SetDstOriginSamplesYV;
            _dstWidth = (int)state.SetDstWidth;
            _dstHeight = (int)state.SetDstHeight;
            _dstStride = (int)state.PitchOut;

            // Block dimensions are stored as log2 values.
            _dstGobBlocksInY = 1 << (int)state.SetDstBlockSizeHeight;
            _dstGobBlocksInZ = 1 << (int)state.SetDstBlockSizeDepth;

            _lineLengthIn = (int)state.LineLengthIn;
            _lineCount = (int)state.LineCount;

            _finished = false;
        }

        /// <summary>
        /// Pushes a block of data to the Inline-to-Memory engine.
        /// </summary>
        /// <remarks>
        /// Words that do not fit in the staging buffer are dropped. Data pushed after the
        /// transfer has finished is ignored until the next LaunchDma.
        /// </remarks>
        /// <param name="data">Data to push</param>
        public void LoadInlineData(ReadOnlySpan<int> data)
        {
            if (!_finished)
            {
                int copySize = Math.Min(data.Length, _buffer.Length - _offset);
                data.Slice(0, copySize).CopyTo(new Span<int>(_buffer).Slice(_offset, copySize));

                _offset += copySize;

                if (_offset * 4 >= _size)
                {
                    FinishTransfer();
                }
            }
        }

        /// <summary>
        /// Pushes a word of data to the Inline-to-Memory engine.
        /// </summary>
        /// <remarks>
        /// Data pushed after the transfer has finished is ignored until the next LaunchDma.
        /// </remarks>
        /// <param name="argument">Method call argument</param>
        public void LoadInlineData(int argument)
        {
            if (!_finished)
            {
                _buffer[_offset++] = argument;

                if (_offset * 4 >= _size)
                {
                    FinishTransfer();
                }
            }
        }

        /// <summary>
        /// Performs actual copy of the inline data after the transfer is finished.
        /// </summary>
        /// <remarks>
        /// Three paths exist: a single linear line is written directly as a tracked resource;
        /// a non-linear destination that matches a cached texture is written through that texture;
        /// everything else is copied to memory using per-position offset calculation.
        /// Every path marks the transfer as finished, so extra data words are ignored.
        /// </remarks>
        private void FinishTransfer()
        {
            var memoryManager = _channel.MemoryManager;

            var data = MemoryMarshal.Cast<int, byte>(_buffer).Slice(0, _size);

            if (_isLinear && _lineCount == 1)
            {
                memoryManager.WriteTrackedResource(_dstGpuVa, data.Slice(0, _lineLengthIn));
                _context.AdvanceSequence();
            }
            else if (!TryCopyToCachedTexture(data))
            {
                CopyLinesToMemory(data);
                _context.AdvanceSequence();
            }

            // Previously the cached texture path returned before reaching this point, leaving the
            // transfer marked as unfinished and allowing further data words to be processed.
            _finished = true;
        }

        /// <summary>
        /// Tries to write the inline data through a cached texture that matches the destination.
        /// </summary>
        /// <remarks>
        /// The sequence number is not advanced on this path, which preserves the existing behavior.
        /// </remarks>
        /// <param name="data">Inline data, with every line padded to 4 bytes</param>
        /// <returns>True if a matching texture was found and updated, false otherwise</returns>
        private bool TryCopyToCachedTexture(Span<byte> data)
        {
            // TODO: Verify if the destination X/Y and width/height are taken into account
            // for linear texture transfers. If not, we can use the fast path for that as well.
            // Right now the memory copy fallback assumes that it is used on both which might be incorrect.
            if (_isLinear)
            {
                return false;
            }

            var memoryManager = _channel.MemoryManager;

            var target = memoryManager.Physical.TextureCache.FindTexture(
                memoryManager,
                _dstGpuVa,
                1,
                _dstStride,
                _dstHeight,
                _lineLengthIn,
                _lineCount,
                _isLinear,
                _dstGobBlocksInY,
                _dstGobBlocksInZ);

            if (target == null)
            {
                return false;
            }

            target.SynchronizeMemory();
            target.SetData(data, 0, 0, new GAL.Rectangle<int>(_dstX, _dstY, _lineLengthIn / target.Info.FormatInfo.BytesPerPixel, _lineCount));
            target.SignalModified();

            return true;
        }

        /// <summary>
        /// Copies the inline data to guest memory line by line, computing the destination offset of each write.
        /// </summary>
        /// <param name="data">Inline data, with every line padded to 4 bytes</param>
        private void CopyLinesToMemory(Span<byte> data)
        {
            const int VectorBytes = 16;

            var memoryManager = _channel.MemoryManager;

            var dstCalculator = new OffsetCalculator(
                _dstWidth,
                _dstHeight,
                _dstStride,
                _isLinear,
                _dstGobBlocksInY,
                1);

            // The column range is the same for every line, so compute it once.
            int x1 = _dstX;
            int x2 = _dstX + _lineLengthIn;
            int x1Round = BitUtils.AlignUp(_dstX, VectorBytes);
            int x2Trunc = BitUtils.AlignDown(x2, VectorBytes);

            // All lines must be aligned to 4 bytes, as the data is pushed one word at a time.
            // If our copy length is not a multiple of 4, then we need to skip the padding bytes
            // at the end of every source line.
            int misalignment = _lineLengthIn & 3;
            int linePadding = misalignment != 0 ? 4 - misalignment : 0;

            int srcOffset = 0;

            for (int y = _dstY; y < _dstY + _lineCount; y++)
            {
                int x = x1;

                // Leading bytes, up to the first 16-byte aligned column (if the line reaches it).
                if (x1Round <= x2)
                {
                    for (; x < x1Round; x++, srcOffset++)
                    {
                        int dstOffset = dstCalculator.GetOffset(x, y);

                        ulong dstAddress = _dstGpuVa + (uint)dstOffset;

                        memoryManager.Write(dstAddress, data[srcOffset]);
                    }
                }

                // Aligned middle part, written 16 bytes at a time.
                for (; x < x2Trunc; x += VectorBytes, srcOffset += VectorBytes)
                {
                    int dstOffset = dstCalculator.GetOffset(x, y);

                    ulong dstAddress = _dstGpuVa + (uint)dstOffset;

                    memoryManager.Write(dstAddress, MemoryMarshal.Cast<byte, Vector128<byte>>(data.Slice(srcOffset, VectorBytes))[0]);
                }

                // Trailing bytes that do not fill a whole 16-byte chunk.
                for (; x < x2; x++, srcOffset++)
                {
                    int dstOffset = dstCalculator.GetOffset(x, y);

                    ulong dstAddress = _dstGpuVa + (uint)dstOffset;

                    memoryManager.Write(dstAddress, data[srcOffset]);
                }

                srcOffset += linePadding;
            }
        }
    }
}