/// <summary>
        /// Generalized tensor multiplication, specified by index strings.
        /// The three index strings are compiled into a <see cref="MultiplyProgram"/>, the operands are
        /// validated against that program, and the work is delegated to the program-based overload
        /// without an index transformation.
        /// </summary>
        /// <param name="scale">Scaling factor, forwarded unchanged to the program-based overload.</param>
        /// <param name="A">First operand.</param>
        /// <param name="B">Second operand.</param>
        /// <param name="thisscale">Second scaling factor, forwarded unchanged to the program-based overload
        /// (presumably a pre-scaling of this array; confirm against that overload).</param>
        /// <param name="ThisIndex">Index string for this (the result) array, passed to <c>MultiplyProgram.Compile</c>.</param>
        /// <param name="Aindex">Index string for <paramref name="A"/>, passed to <c>MultiplyProgram.Compile</c>.</param>
        /// <param name="Bindex">Index string for <paramref name="B"/>, passed to <c>MultiplyProgram.Compile</c>.</param>
        public void Multiply(double scale, MultidimensionalArray A, MultidimensionalArray B, double thisscale, string ThisIndex, string Aindex, string Bindex)
        {
            // The trailing 'false' is a flag of MultiplyProgram.Compile; its meaning is defined there.
            MultiplyProgram program = MultiplyProgram.Compile(ThisIndex, Aindex, Bindex, false);
            program.CheckArgs(this, A, B);

            // 'null': no index transformation is supplied on this code path.
            this.Multiply(scale, A, B, thisscale, ref program, null);
        }
        /// <summary>
        /// Generalized tensor multiplication with index-transformation, driven by an already compiled
        /// <see cref="MultiplyProgram"/>.
        /// </summary>
        /// <param name="scale">Scaling factor, forwarded unchanged to the multiplication kernel
        /// (presumably applied to the product of <paramref name="A"/> and <paramref name="B"/>; confirm against the dispatch routines).</param>
        /// <param name="A">First operand; its <c>Dimension</c> must equal <c>mp.DA</c>.</param>
        /// <param name="B">Second operand; its <c>Dimension</c> must equal <c>mp.DB</c>.</param>
        /// <param name="thisscale">Second scaling factor, forwarded unchanged to the multiplication kernel
        /// (presumably a pre-scaling of this array; confirm against the dispatch routines).</param>
        /// <param name="mp">Compiled multiplication program; <c>mp.DT</c> must equal the <c>Dimension</c> of this array.</param>
        /// <param name="IndexTrafo">Pointer to the index transformation. Must not be null if <c>mp.iTrafoIdx</c> is
        /// non-negative; it is not used otherwise.</param>
        /// <param name="IndexTrafo_Length">Forwarded to the transformation kernel together with <paramref name="IndexTrafo"/>
        /// (presumably the number of entries behind the pointer; confirm against the caller).</param>
        /// <param name="trfPreOffset_A">Transformation parameter for <paramref name="A"/>, forwarded unchanged to the transformation kernel.</param>
        /// <param name="trfCycle_A">Transformation parameter for <paramref name="A"/>, forwarded unchanged to the transformation kernel.</param>
        /// <param name="trfPostOffset_A">Transformation parameter for <paramref name="A"/>, forwarded unchanged to the transformation kernel.</param>
        /// <param name="trfPreOffset_B">Transformation parameter for <paramref name="B"/>, forwarded unchanged to the transformation kernel.</param>
        /// <param name="trfCycle_B">Transformation parameter for <paramref name="B"/>, forwarded unchanged to the transformation kernel.</param>
        /// <param name="trfPostOffset_B">Transformation parameter for <paramref name="B"/>, forwarded unchanged to the transformation kernel.</param>
        /// <exception cref="ArgumentException">
        /// The ranks expected by <paramref name="mp"/> do not match the operands, or the program requires an
        /// index transformation but <paramref name="IndexTrafo"/> is null.
        /// </exception>
        public unsafe void Multiply(double scale, MultidimensionalArray A, MultidimensionalArray B, double thisscale,
                                    ref MultiplyProgram mp, int *IndexTrafo, int IndexTrafo_Length,
                                    int trfPreOffset_A = 0, int trfCycle_A = 1, int trfPostOffset_A = 0, int trfPreOffset_B = 0, int trfCycle_B = 1, int trfPostOffset_B = 0)
        {
            // The program must have been compiled for the ranks of all three operands.
            if (mp.DT != this.Dimension)
            {
                throw new ArgumentException(
                    "Rank mismatch for the result array: the multiply program expects " + mp.DT
                    + " dimension(s), but this array has " + this.Dimension + ".", "mp");
            }
            if (mp.DA != A.Dimension)
            {
                throw new ArgumentException(
                    "Rank mismatch for operand A: the multiply program expects " + mp.DA
                    + " dimension(s), but A has " + A.Dimension + ".", "A");
            }
            if (mp.DB != B.Dimension)
            {
                throw new ArgumentException(
                    "Rank mismatch for operand B: the multiply program expects " + mp.DB
                    + " dimension(s), but B has " + B.Dimension + ".", "B");
            }

            int DT = mp.DT;

            // Running cycles: a single scratch buffer holds three tables of DT entries each,
            // laid out back to back for this array, A and B.
            int *cycRunT = stackalloc int[3 * DT];
            int *cycRunA = cycRunT + DT;
            int *cycRunB = cycRunA + DT;

            int *lenRun = stackalloc int[DT];

            for (int i = 0; i < DT; i++)
            {
                cycRunT[i] = this.GetCycle(i);
                lenRun[i] = this.GetLength(i);
            }

            // Scratch tables for the summation loops.
            int *cycSumA = stackalloc int[MultiplyProgram.MAX_SUM_LOOPS];
            int *cycSumB = stackalloc int[MultiplyProgram.MAX_SUM_LOOPS];
            int *lenSum  = stackalloc int[MultiplyProgram.MAX_SUM_LOOPS];

#if DEBUG
            mp.CheckArgs(this, A, B);
#endif

            // The program populates the running cycles of A and B and the summation tables;
            // they are read below without any other initialization.
            mp.GetCyclesAndRanks(cycRunA, cycRunB, lenSum, cycSumA, cycSumB, A, B);

            if (mp.NoOfSumCycles == 2)
            {
                // for better loop unrolling, make sure the inner loop is the smaller one
                if (lenSum[1] > lenSum[0])
                {
                    SwapInt(cycSumA + 0, cycSumA + 1);
                    SwapInt(cycSumB + 0, cycSumB + 1);
                    SwapInt(lenSum + 0, lenSum + 1);
                }
            }

            // Execute Tensor Multiplication
            // =============================
            fixed(double *pTstor = this.m_Storage, pAstor = A.m_Storage, pBstor = B.m_Storage)
            {
                // Each array starts at its own offset within its storage.
                double *pT = pTstor + this.m_Offset;
                double *pA = pAstor + A.m_Offset;
                double *pB = pBstor + B.m_Offset;

                if (mp.iTrafoIdx >= 0)
                {
                    // The program uses a transformed index, so a transformation must be supplied.
                    // Checked before the swaps below; those only touch stack-local scratch tables.
                    if (IndexTrafo == null)
                    {
                        throw new ArgumentException("Index transformation required.");
                    }

                    if (mp.iTrafoIdx > 0)
                    {
                        // transformed cycle MUST be the outer-most, i.e. the first one.
                        // => need to shift some cycles.
                        int kk = mp.iTrafoIdx;
                        SwapInt(lenRun + 0, lenRun + kk);
                        SwapInt(cycRunT + 0, cycRunT + kk);
                        SwapInt(cycRunA + 0, cycRunA + kk);
                        SwapInt(cycRunB + 0, cycRunB + kk);
                    }

                    // The literal triple (0, 1, 0) precedes the A and B transformation parameters;
                    // presumably it plays the same role for the result array (confirm against
                    // MultiplyWTrafo_Dispatch).
                    MultiplyWTrafo_Dispatch(
                        DT, mp.NoOfSumCycles, pT, pA, pB, lenRun, cycRunT, cycRunA, cycRunB, lenSum, cycSumA, cycSumB, scale, thisscale,
                        IndexTrafo, IndexTrafo_Length, mp.TrfT0Sw, mp.TrfA0Sw, mp.TrfB0Sw,
                        0, 1, 0, trfPreOffset_A, trfCycle_A, trfPostOffset_A, trfPreOffset_B, trfCycle_B, trfPostOffset_B);
                }
                else
                {
                    Multiply_Dispatch(DT, mp.NoOfSumCycles, pT, pA, pB, lenRun, cycRunT, cycRunA, cycRunB, lenSum, cycSumA, cycSumB, scale, thisscale);
                }
            }
        }