/// <summary>
/// Parallel front end for the general matrix-matrix multiply: validates the operands,
/// cuts the work into blocks and hands every block to <c>seqBlas.Dgemm</c> through
/// <c>smp.TaskGroup</c>.
/// <para>
/// Splitting strategy: when <c>B</c> has at least as many columns as there are tasks,
/// <c>B</c> and <c>C</c> are cut into vertical column strips and every task multiplies
/// the full <c>A</c> with one strip. Otherwise <c>A</c> and <c>C</c> are cut into
/// horizontal row strips and every task multiplies one strip with the full <c>B</c>.
/// </para>
/// </summary>
/// <param name="transposeA">If true, <c>A.ViewDice()</c> is used in place of <c>A</c>.</param>
/// <param name="transposeB">If true, <c>B.ViewDice()</c> is used in place of <c>B</c>.</param>
/// <param name="alpha">Scalar forwarded unchanged to <c>seqBlas.Dgemm</c>.</param>
/// <param name="A">Left operand.</param>
/// <param name="B">Right operand; its row count must equal the column count of <c>A</c>.</param>
/// <param name="beta">Scalar forwarded unchanged to <c>seqBlas.Dgemm</c>.</param>
/// <param name="C">Result matrix; must be <c>A.Rows x B.Columns</c> and must not be <c>A</c> or <c>B</c>.</param>
/// <exception cref="ArgumentException">
/// If the inner dimensions disagree, if <c>C</c> has the wrong shape, or if <c>C</c> is identical to <c>A</c> or <c>B</c>.
/// </exception>
public void Dgemm(bool transposeA, bool transposeB, double alpha, DoubleMatrix2D A, DoubleMatrix2D B, double beta, DoubleMatrix2D C) {
    // Resolve the transposition flags first (A before B) by recursing on diced views;
    // everything below therefore runs with both flags false.
    if (transposeA) {
        Dgemm(false, transposeB, alpha, A.ViewDice(), B, beta, C);
        return;
    }
    if (transposeB) {
        Dgemm(transposeA, false, alpha, A, B.ViewDice(), beta, C);
        return;
    }

    int rowsA = A.Rows;
    int inner = A.Columns;
    int colsB = B.Columns;

    if (B.Rows != inner) {
        throw new ArgumentException("Matrix2D inner dimensions must agree:" + A.ToStringShort() + ", " + B.ToStringShort());
    }
    if (C.Rows != rowsA || C.Columns != colsB) {
        throw new ArgumentException("Incompatibel result matrix: " + A.ToStringShort() + ", " + B.ToStringShort() + ", " + C.ToStringShort());
    }
    if (A == C || B == C) {
        throw new ArgumentException("Matrices must not be identical");
    }

    // Every task should get at least 30000 flops, capped by the configured thread limit.
    long flops = 2L * rowsA * inner * colsB;
    int taskCount = (int)System.Math.Min(flops / 30000, this.maxThreads);

    // Prefer column strips of B; fall back to row strips of A when B is too narrow.
    bool splitColumns = colsB >= taskCount;
    int width = splitColumns ? colsB : rowsA;
    taskCount = System.Math.Min(width, taskCount);

    if (taskCount < 2) {
        // Not enough work to amortise the start-up overhead: run sequentially.
        seqBlas.Dgemm(transposeA, transposeB, alpha, A, B, beta, C);
        return;
    }

    int baseSpan = width / taskCount;
    int lastBlock = taskCount - 1;

    for (int block = 0; block < taskCount; block++) {
        int start = block * baseSpan;

        // The final block also absorbs the remainder of the integer division.
        int length = baseSpan;
        if (block == lastBlock) {
            length = width - baseSpan * block;
        }

        // Declared inside the loop so that each queued lambda captures its own views.
        DoubleMatrix2D partA, partB, partC;
        if (!splitColumns) {
            // row strip of A (and the matching rows of C) times the whole of B
            partA = A.ViewPart(start, 0, length, inner);
            partB = B;
            partC = C.ViewPart(start, 0, length, colsB);
        } else {
            // the whole of A times a column strip of B (and the matching columns of C)
            partA = A;
            partB = B.ViewPart(0, start, inner, length);
            partC = C.ViewPart(0, start, rowsA, length);
        }

        // NOTE(review): nothing visible here waits for the queued work. Whether
        // QueueTask blocks until the block has been computed cannot be told from
        // this method - confirm against the TaskGroup implementation.
        try {
            this.smp.TaskGroup.QueueTask(() => {
                seqBlas.Dgemm(transposeA, transposeB, alpha, partA, partB, beta, partC);
            });
        } catch (TaskCanceledException) {
            // A cancellation raised while queueing is deliberately ignored.
        }
    }
}
/// <summary>
/// Linear algebraic matrix power; <i>B = A<sup>p</sup> &lt;==&gt; B = A*A*...*A</i>.
/// <ul>
/// <li><i>p &gt;= 1: B = A*A*...*A</i>.</li>
/// <li><i>p == 0: B = identity matrix</i>.</li>
/// <li><i>p &lt; 0: B = pow(inverse(A),-p)</i>.</li>
/// </ul>
/// Implementation: binary exponentiation (repeated squaring), so the number of matrix
/// multiplications grows with the bit length of <i>p</i> instead of with <i>p</i> itself.
/// </summary>
/// <param name="A">the source matrix; must be square; stays unaffected by this operation.</param>
/// <param name="p">the exponent, can be any integer.</param>
/// <returns><i>B</i>, a newly constructed result matrix; storage-independent of <i>A</i>.</returns>
/// <exception cref="ArgumentException">if <i>!property().isSquare(A)</i>.</exception>
public static DoubleMatrix2D Pow(DoubleMatrix2D A, int p) {
    // A*A*...*A needs p-1 multiplications; squaring ((A*A)^2)^2... needs far fewer.
    IBlas blas = SmpBlas.smpBlas; // for parallel matrix mult; if not initialized defaults to sequential blas
    Property.DEFAULT.CheckSquare(A);

    if (p < 0) {
        // NOTE(review): -p overflows for int.MinValue; that exponent is not verified here.
        A = Inverse(A);
        p = -p;
    }
    if (p == 0) {
        return DoubleFactory2D.Dense.Identity(A.Rows);
    }

    DoubleMatrix2D T = A.Like(); // scratch matrix, same shape as A
    if (p == 1) {
        return T.Assign(A); // saves the auxiliary allocations below
    }
    if (p == 2) {
        blas.Dgemm(false, false, 1, A, A, 0, T); // T = A*A
        return T;
    }

    // From here on A and T are swapped after every multiplication, so both act as
    // scratch space. Work on a private copy: without it the caller's matrix becomes
    // T after the first swap and is overwritten by the next multiplication (for
    // p == 3 it would even be returned as the result), breaking the documented
    // "stays unaffected" / "storage-independent" contract.
    A = A.Copy();

    int k = Cern.Colt.Bitvector.QuickBitVector.MostSignificantBit(p); // index of highest bit in state "true"

    // Phase 1: square A once for every trailing zero bit of p.
    int i = 0;
    while (i <= k && (p & (1 << i)) == 0) {
        blas.Dgemm(false, false, 1, A, A, 0, T); // T = A*A
        DoubleMatrix2D swap = A; A = T; T = swap; // swap A with T
        i++;
    }

    // A now holds the power belonging to the lowest set bit of p; it seeds the result.
    DoubleMatrix2D B = A.Copy();
    i++;

    // Phase 2: keep squaring; multiply into B whenever the current bit of p is set.
    for (; i <= k; i++) {
        blas.Dgemm(false, false, 1, A, A, 0, T); // T = A*A
        DoubleMatrix2D swap = A; A = T; T = swap; // swap A with T

        if ((p & (1 << i)) != 0) {
            blas.Dgemm(false, false, 1, B, A, 0, T); // T = B*A
            swap = B; B = T; T = swap; // swap B with T
        }
    }
    return B;
}