/// <summary>
        /// Generalized tensor multiplication, driven by index strings: the three strings are
        /// compiled into a <see cref="MultiplyProgram"/> (without index-transformation detection),
        /// the program is checked against the operands and the work is handed over to the
        /// program-based overload.
        /// </summary>
        /// <param name="scale">Scaling factor, passed on unchanged to the program-based overload.</param>
        /// <param name="A">First operand.</param>
        /// <param name="B">Second operand.</param>
        /// <param name="thisscale">Scaling factor, passed on unchanged to the program-based overload.</param>
        /// <param name="ThisIndex">Index names of this (the result) array, one character per rank.</param>
        /// <param name="Aindex">Index names of <paramref name="A"/>, one character per rank.</param>
        /// <param name="Bindex">Index names of <paramref name="B"/>, one character per rank.</param>
        public void Multiply(double scale, MultidimensionalArray A, MultidimensionalArray B, double thisscale, string ThisIndex, string Aindex, string Bindex)
        {
            // this entry point never asks the compiler to look for index transformations
            const bool withTrafoDetection = false;

            MultiplyProgram program = MultiplyProgram.Compile(ThisIndex, Aindex, Bindex, withTrafoDetection);
            program.CheckArgs(this, A, B);

            // no transformation table: the last argument stays null
            this.Multiply(scale, A, B, thisscale, ref program, null);
        }
            /// <summary>
            /// Compiles a 'program' to compute e.g. a tensor product
            /// like
            /// \f[
            ///    T_{i j n} = \sum_{k m} A_{i k m} B_{k j n m}.
            /// \f]
            /// In this example, <paramref name="Tindex"/> = "ijn", <paramref name="Aindex"/> = "ikm" and <paramref name="Bindex"/> = "kjnm".
            /// Index names which occur in <paramref name="Tindex"/> are running indices; all other
            /// index names of <paramref name="Aindex"/> and <paramref name="Bindex"/> are summation indices.
            /// </summary>
            /// <param name="Tindex">Indices into result array; each name must be unique.</param>
            /// <param name="Aindex">Indices into first tensor.</param>
            /// <param name="Bindex">Indices into second tensor.</param>
            /// <param name="detectTrafo">
            /// Enables index-transformation.
            /// </param>
            /// <returns>The compiled program.</returns>
            /// <exception cref="ArgumentNullException">One of the index strings is null.</exception>
            /// <exception cref="ArgumentException">
            /// An index transformation is requested for the result array, a running index name is not unique,
            /// or a summation index occurs in only one of the two operands.
            /// </exception>
            /// <exception cref="NotSupportedException">
            /// An index transformation is applied to a summation index, or there are more than
            /// <c>MAX_SUM_LOOPS</c> summation indices.
            /// </exception>
            public static MultiplyProgram Compile(string Tindex, string Aindex, string Bindex, bool detectTrafo = false)
            {
                if (Tindex == null)
                {
                    throw new ArgumentNullException("Tindex");
                }
                if (Aindex == null)
                {
                    throw new ArgumentNullException("Aindex");
                }
                if (Bindex == null)
                {
                    throw new ArgumentNullException("Bindex");
                }

                MultiplyProgram p = default(MultiplyProgram);

                unsafe {
                    char[] TIdxNames = Tindex.ToCharArray();
                    char[] AIdxNames = Aindex.ToCharArray();
                    char[] BIdxNames = Bindex.ToCharArray();

                    char TrafoName = (char)0, TrafoVarName = (char)0;
                    bool useTrafo  = false;

                    // The marker buffers are sized by the index strings as passed in; the name arrays
                    // are handed to IdentifyTrafoIndex by reference and may be replaced there.
                    int TmarkerLength = TIdxNames.Length;
                    int AmarkerLength = Aindex.Length;
                    int BmarkerLength = Bindex.Length;
                    bool *TtrafoIndexMarker = stackalloc bool[TmarkerLength];
                    bool *AtrafoIndexMarker = stackalloc bool[AmarkerLength];
                    bool *BtrafoIndexMarker = stackalloc bool[BmarkerLength];

                    // The content of stackalloc'd memory is undefined. The A- and B-markers are read
                    // further below even if 'detectTrafo' is false, so they must be cleared explicitly.
                    for (int i = 0; i < TmarkerLength; i++)
                    {
                        TtrafoIndexMarker[i] = false;
                    }
                    for (int i = 0; i < AmarkerLength; i++)
                    {
                        AtrafoIndexMarker[i] = false;
                    }
                    for (int i = 0; i < BmarkerLength; i++)
                    {
                        BtrafoIndexMarker[i] = false;
                    }

                    if (detectTrafo)
                    {
                        p.TrfT0Sw = IdentifyTrafoIndex(ref TIdxNames, &useTrafo, &TrafoName, &TrafoVarName, TtrafoIndexMarker) ? 1 : 0;
                        if (p.TrfT0Sw != 0)
                        {
                            throw new ArgumentException("Index transformation is not supported for the result array.");
                        }

                        p.TrfA0Sw = IdentifyTrafoIndex(ref AIdxNames, &useTrafo, &TrafoName, &TrafoVarName, AtrafoIndexMarker) ? 1 : 0;
                        p.TrfB0Sw = IdentifyTrafoIndex(ref BIdxNames, &useTrafo, &TrafoName, &TrafoVarName, BtrafoIndexMarker) ? 1 : 0;

                        // NOTE: 'TrfT0Sw' is always 0 at this point (see the check above), so this
                        // condition cannot currently be true; it is kept as a safeguard in case the
                        // restriction on the result array is lifted.
                        if ((p.TrfT0Sw * p.TrfA0Sw * p.TrfB0Sw) != 0)
                        {
                            throw new ArgumentException("Index-transformation can only be applied to one or two operands.");
                        }
                    }

                    // ranks are taken from the name arrays *after* the trafo detection
                    p.DT = TIdxNames.Length;
                    p.DA = AIdxNames.Length;
                    p.DB = BIdxNames.Length;

                    // check: running index names must be unique; locate the transformed running index
                    // ===============================================================================
                    p.iTrafoIdx = int.MinValue; // negative value: no transformed running index
                    for (int i = 0; i < p.DT; i++)
                    {
                        if (useTrafo && TIdxNames[i] == TrafoVarName)
                        {
                            if (p.iTrafoIdx >= 0)
                            {
                                throw new ArgumentException("Only one index of can be transformed.");
                            }
                            else
                            {
                                p.iTrafoIdx = i;
                            }
                        }

                        for (int j = i + 1; j < p.DT; j++)
                        {
                            if (TIdxNames[i] == TIdxNames[j])
                            {
                                // the offending argument of *this* method is 'Tindex'
                                throw new ArgumentException("found non-unique running index name.", "Tindex");
                            }
                        }
                    }

                    // Identify running and summation indices, setup running cycles (array A)
                    // ======================================================================
                    bool *bSumMarkerA = stackalloc bool[p.DA];
                    for (int j = 0; j < p.DA; j++)
                    {
                        // an index of A is a summation index unless its name is found in T
                        bSumMarkerA[j] = true;
                        for (int i = 0; i < p.DT; i++)
                        {
                            if (AIdxNames[j] == TIdxNames[i])
                            {
                                // only names are known here, no arrays: lengths cannot be compared at this stage
                                p.AddRunRankA(i, j);

                                bSumMarkerA[j] = false;
                                break;
                            }
                        }

                        if (bSumMarkerA[j] && AtrafoIndexMarker[j])
                        {
                            throw new NotSupportedException("Index transformation is not supported for summation indices.");
                        }
                    }

                    // Identify running and summation indices, setup running cycles (array B)
                    // ======================================================================
                    bool *bSumMarkerB = stackalloc bool[p.DB];
                    for (int j = 0; j < p.DB; j++)  // loop over B-ranks
                    {
                        bSumMarkerB[j] = true;
                        for (int i = 0; i < p.DT; i++)  // loop over running indices (T-ranks)
                        {
                            if (BIdxNames[j] == TIdxNames[i])
                            {
                                p.AddRunRankB(i, j);

                                bSumMarkerB[j] = false;
                                break;
                            }
                        }

                        if (bSumMarkerB[j] && BtrafoIndexMarker[j])
                        {
                            throw new NotSupportedException("Index transformation is not supported for summation indices.");
                        }
                    }

                    // collect & check summation cycles
                    // ================================
                    p.NoOfSumCycles = 0;
                    char *SumIdxNames = stackalloc char[MAX_SUM_LOOPS]; // entry k is written before it is ever read

                    for (int j1 = 0; j1 < p.DA; j1++)
                    {
                        if (!bSumMarkerA[j1])
                        {
                            // running index, already handled above
                            continue;
                        }

                        // look whether this summation name was already registered by an earlier rank of A
                        bool twiceOrMore = false;
                        int  k;
                        for (k = 0; k < p.NoOfSumCycles; k++)
                        {
                            if (SumIdxNames[k] == AIdxNames[j1])
                            {
                                // summation index occurs more than once!
                                p.AddSumRankA(k, j1);
                                twiceOrMore = true;
                                break;
                            }
                        }

                        if (!twiceOrMore)
                        {
                            // here, k == p.NoOfSumCycles, i.e. the slot for a new summation cycle
                            if (k >= MAX_SUM_LOOPS)
                            {
                                throw new NotSupportedException("More than '" + MAX_SUM_LOOPS + "' summation loops are currently not supported.");
                            }
                            SumIdxNames[k] = AIdxNames[j1];
                            p.AddSumRankA(k, j1);
                            p.NoOfSumCycles++;
                        }

                        // attach all not-yet-consumed summation ranks of B with the same name to cycle k
                        bool bfound = false;
                        for (int j2 = 0; j2 < p.DB; j2++)
                        {
                            if (!bSumMarkerB[j2])
                            {
                                continue;
                            }

                            if (BIdxNames[j2] == AIdxNames[j1])
                            {
                                bfound = true;

                                p.AddSumRankB(k, j2);
                                bSumMarkerB[j2] = false; // consumed; remaining markers are checked below
                            }
                        }

                        // for a repeated name, the B-ranks were already consumed by the first occurrence
                        if (bfound == false && twiceOrMore == false)
                        {
                            throw new ArgumentException(string.Format("summation index '{0}' present in 'A', but not in 'B'", AIdxNames[j1]));
                        }
                    }

                    // every summation rank of B which is still marked has no counterpart in A
                    for (int j2 = 0; j2 < p.DB; j2++)
                    {
                        if (bSumMarkerB[j2])
                        {
                            throw new ArgumentException(string.Format("summation index '{0}' present in 'B', but not in 'A'", BIdxNames[j2]));
                        }
                    }

                    return(p);
                }
            }
        /// <summary>
        /// Generalized tensor multiplication with index-transformation; pointer-based variant
        /// which executes a pre-compiled <see cref="MultiplyProgram"/> on this array (result)
        /// and the operands <paramref name="A"/> and <paramref name="B"/>.
        /// </summary>
        /// <param name="scale">Scaling factor, forwarded unchanged to the multiplication kernel.</param>
        /// <param name="A">First operand; its dimension must match the program.</param>
        /// <param name="B">Second operand; its dimension must match the program.</param>
        /// <param name="thisscale">Scaling factor, forwarded unchanged to the multiplication kernel.</param>
        /// <param name="mp">The compiled multiplication program.</param>
        /// <param name="IndexTrafo">
        /// Index transformation table; must not be null if the program contains a transformed running index.
        /// </param>
        /// <param name="IndexTrafo_Length">Number of entries in <paramref name="IndexTrafo"/>.</param>
        /// <param name="trfPreOffset_A">Transformation parameter for <paramref name="A"/>, forwarded to the kernel.</param>
        /// <param name="trfCycle_A">Transformation parameter for <paramref name="A"/>, forwarded to the kernel.</param>
        /// <param name="trfPostOffset_A">Transformation parameter for <paramref name="A"/>, forwarded to the kernel.</param>
        /// <param name="trfPreOffset_B">Transformation parameter for <paramref name="B"/>, forwarded to the kernel.</param>
        /// <param name="trfCycle_B">Transformation parameter for <paramref name="B"/>, forwarded to the kernel.</param>
        /// <param name="trfPostOffset_B">Transformation parameter for <paramref name="B"/>, forwarded to the kernel.</param>
        /// <exception cref="ArgumentNullException"><paramref name="A"/> or <paramref name="B"/> is null.</exception>
        /// <exception cref="ArgumentException">
        /// The dimension of this array, <paramref name="A"/> or <paramref name="B"/> does not match the program,
        /// or the program requires an index transformation but <paramref name="IndexTrafo"/> is null.
        /// </exception>
        unsafe public void Multiply(double scale, MultidimensionalArray A, MultidimensionalArray B, double thisscale,
                                    ref MultiplyProgram mp, int *IndexTrafo, int IndexTrafo_Length,
                                    int trfPreOffset_A = 0, int trfCycle_A = 1, int trfPostOffset_A = 0, int trfPreOffset_B = 0, int trfCycle_B = 1, int trfPostOffset_B = 0)
        {
            if (A == null)
            {
                throw new ArgumentNullException("A");
            }
            if (B == null)
            {
                throw new ArgumentNullException("B");
            }

            // the program was compiled for fixed ranks of result and operands
            if (mp.DT != this.Dimension)
            {
                throw new ArgumentException(string.Format("Dimension mismatch: multiply-program expects a result array of dimension {0}, but this array has dimension {1}.", mp.DT, this.Dimension), "mp");
            }
            if (mp.DA != A.Dimension)
            {
                throw new ArgumentException(string.Format("Dimension mismatch: multiply-program expects dimension {0} for the first operand, but got {1}.", mp.DA, A.Dimension), "A");
            }
            if (mp.DB != B.Dimension)
            {
                throw new ArgumentException(string.Format("Dimension mismatch: multiply-program expects dimension {0} for the second operand, but got {1}.", mp.DB, B.Dimension), "B");
            }

            int DT = mp.DT;

            // running cycles: one stack buffer, split into three sections of length DT (T, A, B)
            int *cycRunT = stackalloc int[3 * DT];
            int *cycRunA = cycRunT + DT;
            int *cycRunB = cycRunA + DT;

            int *lenRun = stackalloc int[DT];

            for (int i = 0; i < DT; i++)
            {
                cycRunT[i] = this.GetCycle(i);
                lenRun[i]  = this.GetLength(i);
            }

            int *cycSumA = stackalloc int[MultiplyProgram.MAX_SUM_LOOPS];
            int *cycSumB = stackalloc int[MultiplyProgram.MAX_SUM_LOOPS];
            int *lenSum  = stackalloc int[MultiplyProgram.MAX_SUM_LOOPS];

#if DEBUG
            mp.CheckArgs(this, A, B);
#endif

            // NOTE(review): cycRunA, cycRunB and the summation buffers are not written in this method;
            // they are expected to be filled by GetCyclesAndRanks -- confirm that all entries read by
            // the kernels are assigned there, since stackalloc memory is not guaranteed to be zeroed.
            mp.GetCyclesAndRanks(cycRunA, cycRunB, lenSum, cycSumA, cycSumB, A, B);

            if (mp.NoOfSumCycles == 2)
            {
                // for better loop unrolling, make sure the inner loop is the smaller one
                if (lenSum[1] > lenSum[0])
                {
                    SwapInt(cycSumA + 0, cycSumA + 1);
                    SwapInt(cycSumB + 0, cycSumB + 1);
                    SwapInt(lenSum + 0, lenSum + 1);
                }
            }

            // Execute Tensor Multiplication
            // =============================
            fixed(double *pTstor = this.m_Storage, pAstor = A.m_Storage, pBstor = B.m_Storage)
            {
                double *pT = pTstor + this.m_Offset;
                double *pA = pAstor + A.m_Offset;
                double *pB = pBstor + B.m_Offset;

                if (mp.iTrafoIdx >= 0)
                {
                    // the program contains a transformed running index: a table is mandatory
                    if (IndexTrafo == null)
                    {
                        throw new ArgumentException("Index transformation required.");
                    }

                    if (mp.iTrafoIdx > 0)
                    {
                        // transformed cycle MUST be the outer-most, i.e. the first one.
                        // => exchange it with the cycle at position 0 (in all four buffers alike).
                        int kk = mp.iTrafoIdx;
                        SwapInt(lenRun + 0, lenRun + kk);
                        SwapInt(cycRunT + 0, cycRunT + kk);
                        SwapInt(cycRunA + 0, cycRunA + kk);
                        SwapInt(cycRunB + 0, cycRunB + kk);
                    }

                    // the constants '0, 1, 0' occupy the argument slots preceding the A- and B-transformation
                    // parameters (NOTE(review): presumably pre-offset, cycle and post-offset of the result array -- confirm)
                    MultiplyWTrafo_Dispatch(
                        DT, mp.NoOfSumCycles, pT, pA, pB, lenRun, cycRunT, cycRunA, cycRunB, lenSum, cycSumA, cycSumB, scale, thisscale,
                        IndexTrafo, IndexTrafo_Length, mp.TrfT0Sw, mp.TrfA0Sw, mp.TrfB0Sw,
                        0, 1, 0, trfPreOffset_A, trfCycle_A, trfPostOffset_A, trfPreOffset_B, trfCycle_B, trfPostOffset_B);
                }
                else
                {
                    Multiply_Dispatch(DT, mp.NoOfSumCycles, pT, pA, pB, lenRun, cycRunT, cycRunA, cycRunB, lenSum, cycSumA, cycSumB, scale, thisscale);
                }
            }
        }
 /// <summary>
 /// Generalized tensor multiplication with index-transformation; managed-array front-end
 /// which pins <paramref name="IndexTrafo"/> and forwards everything to the pointer-based overload.
 /// </summary>
 /// <param name="scale">Scaling factor, passed on unchanged.</param>
 /// <param name="A">First operand.</param>
 /// <param name="B">Second operand.</param>
 /// <param name="thisscale">Scaling factor, passed on unchanged.</param>
 /// <param name="mp">The compiled multiplication program.</param>
 /// <param name="IndexTrafo">Optional index transformation table; may be null.</param>
 public void Multiply(double scale, MultidimensionalArray A, MultidimensionalArray B, double thisscale, ref MultiplyProgram mp, int[] IndexTrafo = null)
 {
     // without a table, a length of zero is reported
     int trafoLength = 0;
     if (IndexTrafo != null)
     {
         trafoLength = IndexTrafo.Length;
     }

     unsafe
     {
         // pinning a null (or empty) array yields a null pointer
         fixed(int *pTrafo = IndexTrafo)
         {
             this.Multiply(scale, A, B, thisscale, ref mp, pTrafo, trafoLength);
         }
     }
 }