/// <summary>
    /// Smoke test: allocates two tensors from a <c>TensorCachingAllocator</c>,
    /// multiplies them with <c>ReferenceComputeOps.MatMul</c> (no transposes) and
    /// checks that the result has the flat dimensions of a matrix product.
    /// Allocator cleanup runs even when an intermediate step throws.
    /// </summary>
    public void TensorCachingAllocatorTest()
    {
        Debug.Log(ComputeShaderSingleton.Instance);
        ReferenceComputeOps gpuOps = new ReferenceComputeOps(ComputeShaderSingleton.Instance.referenceKernels);

        TensorCachingAllocator tca = new TensorCachingAllocator();

        try
        {
            int[]  shape = new[] { 2, 3, 5, 1 };
            Tensor X     = tca.Alloc(new TensorShape(shape));
            Tensor W     = tca.Alloc(new TensorShape(15, 7));

            X[0] = 3;
            W[0] = 5;
            Debug.Log($"X WxH:{X.flatHeight} {X.flatWidth}");
            Debug.Log($"W WxH:{W.flatHeight} {W.flatWidth}");

            // NOTE(review): Y is never disposed explicitly in this test; presumably it is
            // owned by gpuOps' allocator and released by ResetAllocator below - confirm.
            Tensor Y = gpuOps.MatMul(X, false, W, false);

            Debug.Log($"Y WxH:{Y.flatHeight} {Y.flatWidth}");
            Debug.Log(X.data.GetType());

            // Replaces the former no-op Debug.Assert(true): a non-transposed product
            // must have X's row count and W's column count.
            Debug.Assert(Y.flatHeight == X.flatHeight && Y.flatWidth == W.flatWidth,
                "MatMul result should be X.flatHeight x W.flatWidth");
        }
        finally
        {
            // Same release order as before, but now also executed on failure.
            tca.Dispose();
            gpuOps.ResetAllocator(false);
        }
    }
    /// <summary>
    /// Smoke test: builds two standalone tensors, multiplies them with
    /// <c>ReferenceComputeOps.MatMul</c> (no transposes) and checks that the result
    /// has the flat dimensions of a matrix product.
    /// All tensors and the ops allocator are released even when a step throws.
    /// </summary>
    public void ReferenceComputeOps_BasicTest()
    {
        Debug.Log(ComputeShaderSingleton.Instance);
        ReferenceComputeOps gpuOps = new ReferenceComputeOps(ComputeShaderSingleton.Instance.referenceKernels);

        // Declared outside the try so the finally block can dispose whatever was created.
        Tensor X = null;
        Tensor W = null;
        Tensor Y = null;

        try
        {
            int[] shape = new[] { 2, 3, 5, 1 };
            X = new Tensor(shape, "TestX");
            W = new Tensor(new TensorShape(15, 7), "TestW");

            X[0] = 3;
            W[0] = 5;
            Debug.Log($"X WxH:{X.flatHeight} {X.flatWidth}");
            Debug.Log($"W WxH:{W.flatHeight} {W.flatWidth}");
            Y = gpuOps.MatMul(X, false, W, false);

            Debug.Log($"Y WxH:{Y.flatHeight} {Y.flatWidth}");

            // Replaces the former no-op Debug.Assert(true): a non-transposed product
            // must have X's row count and W's column count.
            Debug.Assert(Y.flatHeight == X.flatHeight && Y.flatWidth == W.flatWidth,
                "MatMul result should be X.flatHeight x W.flatWidth");
        }
        finally
        {
            // Same release order as before; null-conditional calls skip tensors
            // that were never created because an earlier step threw.
            X?.Dispose();
            W?.Dispose();
            Y?.Dispose();
            gpuOps.ResetAllocator(false);
        }
    }