/// <summary>
        /// Builds a new <see cref="ONNXTensor"/> in which both the tensor data and the
        /// shape have been reordered by the same axis permutation.
        /// </summary>
        /// <param name="permutations">Axis order applied to the data and to the shape.</param>
        /// <returns>A new <see cref="ONNXTensor"/> wrapping the permuted data and the permuted shape.</returns>
        public ONNXTensor Permute(int[] permutations)
        {
            // Arguments are evaluated left to right: the data is permuted first
            // (static Permute overload), then the shape (ONNXLayout.Permute).
            return new ONNXTensor(
                Permute(m_Data, permutations),
                ONNXLayout.Permute(m_Shape, permutations));
        }
        internal static Tensor Permute(Tensor inTensor, int[] permutations) // TODO: unify Permute() arguments
        {
            var padPermutationsToBarracudaRank = 4 - permutations.Length;

            if (padPermutationsToBarracudaRank > 0)
            {
                permutations = permutations.Concat(Enumerable.Range(permutations.Length, padPermutationsToBarracudaRank)).ToArray();
            }
            Debug.Assert(permutations.Length == 4);

            // See: https://stackoverflow.com/a/32034565
            Profiler.BeginSample("ONNXTensor.Permute");
            var outTensor = new Tensor(ONNXLayout.Permute(inTensor.shape.ToArray(), permutations));

            Debug.Assert(outTensor.length == inTensor.length);

            // {0, 2, 3, 1} => {0, 3, 1, 2}
            // {2, 3, 1, 0} => {3, 2, 0, 1}
            //              => {find_index(0), find_index(1), find_index(2), find_index(3)}
            var reversePermute = new int[permutations.Length];

            for (var i = 0; i < permutations.Length; ++i)
            {
                reversePermute[i] = Array.IndexOf(permutations, i);
            }

            // outTensor strides
            var outStrideC   = outTensor.channels;
            var outStrideWC  = outStrideC * outTensor.width;
            var outStrideHWC = outStrideWC * outTensor.height;

            var outStride = new int[reversePermute.Length];

            for (var i = 0; i < reversePermute.Length; ++i)
            {
                outStride[i] = new[] { 0, outStrideHWC, outStrideWC, outStrideC, 1 }
            }