        private static void ExecuteSingleVectorOperation<T>(
            BinaryExpression operation,
            LinearAlgebraProvider provider,
            ArrayPoolStack<T> arrayPoolStack,
            List<NArray<T>> localsToFree,
            OutputGetter<T> getter,
            Aggregator aggregator,
            int vectorLength,
            int chunkIndex,
            int startIndex,
            ExecutionTimer timer)
        {
            if (operation == null || operation.NodeType != ExpressionType.Assign) return;

            NArray<T> result;
            var left = operation.Left as ReferencingVectorParameterExpression<T>;

            NArray<T> aggregationTarget = null;
            if (left.ParameterType == ParameterType.Local)
            {
                result = left.Array;

                // locals use chunked storage: assign a pooled array to this chunk before writing
                var chunkyStorage = result.Storage as ChunkyStorage<T>;
                if (chunkyStorage != null)
                {
                    var newStorage = arrayPoolStack.Pop();
                    chunkyStorage.SetChunk(chunkIndex, newStorage);
                }

                aggregationTarget = getter.TryGetNext(left.Index);
            }
            else
            {
                result = left.Array;
            }

            if (operation.Right is UnaryMathsExpression)
            {
                var unaryOperation = operation.Right as UnaryMathsExpression;

                if (unaryOperation.UnaryType == UnaryElementWiseOperation.ScaleOffset)
                {
                    var scaleOffset = unaryOperation as ScaleOffsetExpression<T>;
                    (provider as IElementWise<T>).ScaleOffset(
                        Slice<T>(unaryOperation.Operand, chunkIndex, startIndex, vectorLength),
                        scaleOffset.Scale, scaleOffset.Offset,
                        Slice(result, chunkIndex, startIndex, vectorLength));
                }
                else if (unaryOperation.UnaryType == UnaryElementWiseOperation.ScaleInverse)
                {
                    var scaleInverse = unaryOperation as ScaleInverseExpression<T>;
                    (provider as IElementWise<T>).ScaleInverse(
                        Slice<T>(unaryOperation.Operand, chunkIndex, startIndex, vectorLength),
                        scaleInverse.Scale,
                        Slice(result, chunkIndex, startIndex, vectorLength));
                }
                else
                {
                    (provider as IElementWise<T>).UnaryElementWiseOperation(
                        Slice<T>(unaryOperation.Operand, chunkIndex, startIndex, vectorLength),
                        Slice(result, chunkIndex, startIndex, vectorLength),
                        unaryOperation.UnaryType);
                }
            }

            if (operation.Right is BinaryExpression)
            {
                var binaryOperation = operation.Right as BinaryExpression;
                (provider as IElementWise<T>).BinaryElementWiseOperation(
                    Slice<T>(binaryOperation.Left, chunkIndex, startIndex, vectorLength),
                    Slice<T>(binaryOperation.Right, chunkIndex, startIndex, vectorLength),
                    Slice(result, chunkIndex, startIndex, vectorLength),
                    binaryOperation.NodeType);
            }

            if (aggregationTarget != null)
            {
                if (aggregator != Aggregator.ElementwiseAdd)
                    throw new NotImplementedException("only element-wise add aggregation is supported");
                if (aggregationTarget.IsScalar)
                    throw new Exception("aggregation target must be a vector");

                // accumulate the chunk result into the aggregation target in place
                var slice = Slice<T>(aggregationTarget, chunkIndex, startIndex, vectorLength);

                (provider as IElementWise<T>).BinaryElementWiseOperation(
                    slice,
                    Slice(result, chunkIndex, startIndex, vectorLength),
                    slice,
                    ExpressionType.Add);
            }

            // return the chunk storage of locals that are no longer needed to the pool
            foreach (var item in localsToFree)
            {
                arrayPoolStack.Push((item.Storage as ChunkyStorage<T>).GetChunk(chunkIndex));
            }
        }
        /// <summary>
        /// Executes the expression block, but without attempting to compile it.
        /// The performance gain comes only from reducing the amount of memory allocation required, not
        /// from compiling an optimised kernel.
        /// </summary>
        /// <param name="_builder">Builder holding the block of operations to execute</param>
        /// <param name="provider">Linear algebra provider used for the element-wise operations</param>
        /// <param name="vectorOptions">Options controlling vectorised execution</param>
        /// <param name="outputs">Arrays that receive the aggregated results</param>
        /// <param name="outputsIndices">Indices of the locals that map to the outputs</param>
        /// <param name="aggregator">Aggregation to apply when writing to the outputs</param>
        /// <param name="timer">Timer used to record execution milestones</param>
        public static void RunNonCompiling(BlockExpressionBuilder _builder,
            LinearAlgebraProvider provider, VectorExecutionOptions vectorOptions,
            NArray[] outputs, int[] outputsIndices, Aggregator aggregator,
            ExecutionTimer timer)
        {
            var block = _builder.ToBlock();

            if (block.Operations.Count == 0) return;

            // we will cycle through arrays in order of increasing index
            var getter = new OutputGetter<double>(outputs, outputsIndices);

            int chunksLength = _builder.VectorLength; // e.g. 5000
            var arrayPoolStack = ExecutionContext.ArrayPool.GetStack(chunksLength);

            int length = (block.ArgumentParameters.First() as ReferencingVectorParameterExpression<double>)
                .Array.Length;

            int chunkCount = length / chunksLength;
            if (length % chunksLength != 0) chunkCount++;

            // the storage that can be freed up after each operation is complete
            List<NArray<double>>[] localsToFree;
            AssignNArrayStorage<double>(block, chunkCount, chunksLength, length, out localsToFree);
            // for integer support, add here

            var options = new ParallelOptions();
            if (!vectorOptions.MultipleThreads) options.MaxDegreeOfParallelism = 1;

            timer.MarkExecutionTemporaryStorageAllocationComplete();

            // could multi-thread here (e.g. Parallel.For(0, chunkCount, options, ...)),
            // but better to do so at a higher level
            for (int i = 0; i < chunkCount; ++i)
            {
                int startIndex = i * chunksLength;
                int vectorLength = Math.Min(chunksLength, length - startIndex);
                for (int j = 0; j < block.Operations.Count; ++j)
                {
                    var operation = block.Operations[j];
                    if (operation.Type == typeof(NArray))
                    {
                        // simplify any expressions containing scalars before executing
                        var newOperation = _builder.SimplifyOperation(operation);
                        ExecuteSingleVectorOperation<double>(newOperation, provider,
                            arrayPoolStack, localsToFree[j],
                            getter, aggregator,
                            vectorLength,
                            i, startIndex, timer);
                    }
                }
            }

            if (!arrayPoolStack.StackCountEqualsCreated)
                throw new Exception("not all storage arrays created were returned to the stack");
        }
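
        // Illustrative call site only (a sketch, not part of the original source): it shows how
        // RunNonCompiling might be invoked once a builder, provider, outputs and timer have been
        // prepared elsewhere. It assumes VectorExecutionOptions exposes a settable MultipleThreads
        // property; only its getter is observed above, so treat the initializer as an assumption.
        private static void RunNonCompilingExample(BlockExpressionBuilder builder,
            LinearAlgebraProvider provider, NArray[] outputs, int[] outputIndices,
            ExecutionTimer timer)
        {
            // run single-threaded; RunNonCompiling maps this to MaxDegreeOfParallelism = 1
            var options = new VectorExecutionOptions { MultipleThreads = false };

            // ElementwiseAdd is the only aggregator handled by ExecuteSingleVectorOperation above
            RunNonCompiling(builder, provider, options, outputs, outputIndices,
                Aggregator.ElementwiseAdd, timer);
        }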