/// <summary>
/// Creates a binder that works through forward (R2C) and inverse (C2R) 1D FFT plans
/// sharing one dedicated CUDA stream, and loads the element-wise helper kernels.
/// </summary>
/// <param name="owner">Node whose GPU is used to load the kernels.</param>
/// <param name="inputSize">Length of a single input vector.</param>
/// <param name="tempBlock">Temporary memory block forwarded to the base binder.</param>
public MyFourierBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
    : base(owner, inputSize, tempBlock)
{
    const string combineVectorsFile = @"Common\CombineVectorsKernel";

    // Both FFT plans run on the same private stream.
    m_stream = new CudaStream();

    m_fft = new CudaFFTPlan1D(inputSize, cufftType.R2C, 1);
    m_fft.SetStream(m_stream.Stream);

    m_ifft = new CudaFFTPlan1D(inputSize, cufftType.C2R, 1);
    m_ifft.SetStream(m_stream.Stream);

    // Complex element-wise multiplication.
    m_mulkernel = MyKernelFactory.Instance.Kernel(owner.GPU, combineVectorsFile, "MulComplexElementWise");
    m_mulkernel.SetupExecution(inputSize + 1);

    // Vector involution.
    m_involutionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, combineVectorsFile, "InvolveVector");
    m_involutionKernel.SetupExecution(inputSize - 1);

    // Inversion of complex values.
    m_inversionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\InvertValuesKernel", "InvertLengthComplexKernel");
    m_inversionKernel.SetupExecution(inputSize);

    m_dotKernel = MyKernelFactory.Instance.KernelProduct<float>(owner, owner.GPU, ProductMode.f_DotProduct_f);

    m_normalKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\TransformKernels", "PolynomialFunctionKernel");
    m_normalKernel.SetupExecution(inputSize);

    // Offsets of the individual regions inside the temporary block.
    int paddedSize = inputSize + 1;
    m_firstFFTOffset = 0;
    m_secondFFTOffset = paddedSize * 2;
    m_tempOffset = paddedSize * 4;

    Denominator = inputSize;
}
/// <summary>
/// Prepares the matrix operations needed by the requested vector operations.
/// </summary>
/// <param name="caller">Node on whose behalf the operations run.</param>
/// <param name="operations">Flags selecting the vector operations that will be used.</param>
/// <param name="tempBlock">Temporary memory; must hold at least 4 items when Rotate is used
/// (directly or through DirectedAngle).</param>
public VectorOps(MyWorkingNode caller, VectorOperation operations, MyMemoryBlock<float> tempBlock)
{
    m_caller = caller;
    m_operations = operations;
    m_temp = tempBlock;

    // DirectedAngle is built on top of Angle and Rotate. Expand it first so that the
    // Rotate requirements below (including the temp-block size check) apply to it too.
    if (m_operations.HasFlag(VectorOperation.DirectedAngle))
    {
        m_operations |= VectorOperation.Angle | VectorOperation.Rotate;
    }

    MatOperation mat_ops = MatOperation.None;

    if (m_operations.HasFlag(VectorOperation.Rotate))
    {
        Debug.Assert(tempBlock != null && tempBlock.Count >= 4, "Temporary memory block has to be large at least 4 items when using Rotate operation");
        mat_ops |= MatOperation.Multiplication;
    }

    if (m_operations.HasFlag(VectorOperation.Angle))
    {
        mat_ops |= MatOperation.DotProd;
    }

    m_matOperation = new MyMatrixAutoOps(caller, mat_ops);
}
/// <summary>
/// Runs a two-operand matrix operation on the host. Only EuclidDist is implemented:
/// for every row of <paramref name="A"/> it stores the Euclidean distance between
/// that row and the vector <paramref name="B"/> into <paramref name="Result"/>.
/// </summary>
/// <remarks>
/// Result is zero-filled first in every case. When B.Count differs from A.ColumnHint
/// nothing else is computed. Any other operation only logs an error.
/// </remarks>
public override void Run(MatOperation operation, MyMemoryBlock<float> A, MyMemoryBlock<float> B, MyMemoryBlock<float> Result)
{
    Result.Fill(.0f);

    if (operation != MatOperation.EuclidDist)
    {
        MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cpu mat ops. for undefined MatOperation");
        return;
    }

    // B must have exactly one entry per column of A.
    if (B.Count != A.ColumnHint)
    {
        return;
    }

    A.SafeCopyToHost();
    B.SafeCopyToHost();

    int rowCount = A.Count / A.ColumnHint;
    for (int row = 0; row < rowCount; row++)
    {
        int rowStart = A.ColumnHint * row;

        // Accumulate the squared differences directly in the result cell.
        Result.Host[row] = 0;
        for (int col = 0; col < B.Count; col++)
        {
            float diff = B.Host[col] - A.Host[rowStart + col];
            Result.Host[row] += diff * diff;
        }

        Result.Host[row] = (float)Math.Sqrt((double)Result.Host[row]);
    }

    Result.SafeCopyToDevice();
}
/// <summary>
/// Creates a back-propagation agent: loads the weight update kernel and installs a
/// label-based delta provider that reads from <paramref name="labelInput"/>.
/// </summary>
/// <param name="network">Network the agent belongs to (forwarded to the base class).</param>
/// <param name="nGPU">GPU identifier used to load the kernel and create the delta provider.</param>
/// <param name="labelInput">Memory block assigned as the provider's label input.</param>
public MyGradientBackPropAgent(MyAbstractFeedForwardNode network, int nGPU, MyMemoryBlock<float> labelInput)
    : base(network)
{
    m_updateWeightKernel = MyKernelFactory.Instance.Kernel(nGPU, @"XmlFeedForwardNet\UpdateWeightKernel");

    // The provider is created from m_network, not from the constructor argument.
    DeltaProvider = new MyLabelDeltaProvider(m_network, nGPU);
    DeltaProvider.LabelInput = labelInput;
}
/// <summary>
/// Creates a binder and loads the two vector-combination kernels it uses,
/// each configured for <paramref name="inputSize"/> elements.
/// </summary>
/// <param name="owner">Node whose GPU is used to load the kernels.</param>
/// <param name="inputSize">Length of a single input vector.</param>
/// <param name="tempBlock">Temporary memory block forwarded to the base binder.</param>
public MyPermutationBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
    : base(owner, inputSize, tempBlock)
{
    const string kernelFile = @"Common\CombineVectorsKernel";

    m_PermKernel = MyKernelFactory.Instance.Kernel(owner.GPU, kernelFile, "CombineVectorsKernel");
    m_PermKernel.SetupExecution(inputSize);

    m_binaryPermKernel = MyKernelFactory.Instance.Kernel(owner.GPU, kernelFile, "CombineTwoVectorsKernel");
    m_binaryPermKernel.SetupExecution(inputSize);
}
/// <summary>
/// Sets the input memory block and offset. The output block and offset are set to the
/// same values, and the sample offset is then sent to the GPU.
/// </summary>
/// <param name="input">Memory block used as input (and as output).</param>
/// <param name="inputOffset">Offset into <paramref name="input"/>.</param>
/// <param name="sampleOffset">Sample offset passed to <c>SendInputSampleOffsetToGPU</c>.</param>
public void SetInputMemoryBlock(MyMemoryBlock<float> input, uint inputOffset = 0, uint sampleOffset = 0)
{
    // Input side.
    m_inputBlock = input;
    m_inputOffset = inputOffset;

    // Output side mirrors the input.
    m_outputBlock = m_inputBlock;
    m_outputOffset = m_inputOffset;

    SendInputSampleOffsetToGPU(sampleOffset);
}
/// <summary>
/// Sets <c>Count</c> and <c>ColumnHint</c> (or <c>Dims</c> for Transpose) of
/// <paramref name="Result"/> according to the operation and the sizes of its operands.
/// </summary>
/// <param name="operation">The matrix operation whose result is being sized.</param>
/// <param name="A">First operand; when null the result is sized 1 x 1.</param>
/// <param name="B">Second operand; may be null for single-operand operations.</param>
/// <param name="Result">Block whose size is set; also the return value.</param>
/// <returns><paramref name="Result"/>.</returns>
/// <remarks>
/// Operations not listed below keep the size of A. Branches that divide by
/// <c>A.ColumnHint</c> are skipped while it is 0, leaving the size of A in place
/// instead of throwing a DivideByZeroException.
/// </remarks>
public static MyMemoryBlock<float> SetupResultSize(MatOperation operation, MyMemoryBlock<float> A, MyMemoryBlock<float> B, MyMemoryBlock<float> Result)
{
    if (A == null)
    {
        Result.Count = 1;
        Result.ColumnHint = 1;
        return Result;
    }

    // Default: the result has the same shape as A.
    Result.Count = A.Count;
    Result.ColumnHint = A.ColumnHint;

    if (operation == MatOperation.DotProd
        || operation == MatOperation.AbsMinIndex
        || operation == MatOperation.AbsMaxIndex)
    {
        // Scalar result.
        Result.ColumnHint = 1;
        Result.Count = 1;
    }
    else if (operation == MatOperation.Multiplication)
    {
        if (B != null && A.ColumnHint != 0 && B.Count > 1)
        {
            Result.ColumnHint = B.ColumnHint;
            Result.Count = B.ColumnHint * A.Count / A.ColumnHint;
        }
    }
    else if (operation == MatOperation.GetCol)
    {
        if (A.ColumnHint != 0) // prevent division by zero
        {
            Result.Count = A.Count / A.ColumnHint;
            Result.ColumnHint = Result.Count;
        }
    }
    else if (operation == MatOperation.GetRow)
    {
        Result.Count = A.ColumnHint;
        Result.ColumnHint = Result.Count;
    }
    else if (operation == MatOperation.MultiplElemntWise || operation == MatOperation.Addition)
    {
        if (B != null)
        {
            Result.ColumnHint = Math.Max(A.ColumnHint, B.ColumnHint);
            Result.Count = Math.Max(A.Count, B.Count);
        }
    }
    else if (operation == MatOperation.Transpose)
    {
        if ((A.ColumnHint != 0) && (A.Count > 0)) // prevent dimension of size 0
        {
            Result.Dims.Set(new[] { -1, A.Count / A.ColumnHint });
        }
    }
    else if (operation == MatOperation.EuclidDist)
    {
        if (B != null && A.ColumnHint != 0) // prevent division by zero
        {
            Result.Count = A.Count / A.ColumnHint;
            Result.ColumnHint = 1;
        }
    }

    return Result;
}
/// <summary>
/// Loads one kernel from <c>Vision\Matrix</c> for every supported operation flag set in
/// <paramref name="operations"/> and stores it in <c>OpersKerlsDictionary</c>.
/// Logs an error when operations were requested but none of them is supported here.
/// </summary>
/// <param name="callee">Node whose GPU is used to load the kernels.</param>
/// <param name="operations">Flags selecting the operations to prepare.</param>
/// <param name="A">Not used by this constructor.</param>
/// <param name="B">Not used by this constructor.</param>
public MyMatrixKernelOps(MyWorkingNode callee, MatOperation operations, MyMemoryBlock<float> A, MyMemoryBlock<float> B = null)
{
    OpersKerlsDictionary = new Dictionary<MatOperation, MyCudaKernel>();
    this.callee = callee;

    // Supported operations and the kernel implementing each, in registration order.
    var kernelNames = new[]
    {
        new KeyValuePair<MatOperation, string>(MatOperation.Log, "LogKernel_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Exp, "ExpKernel_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Round, "RoundKernel_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Floor, "FloorKernel_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Ceil, "CeilKernel_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Abs, "AbsKernel_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.GetCol, "Matrix_getCol_FloatId_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.GetRow, "Matrix_getRow_FloatId_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.MultiplElemntWise, "Matrix_MultiplElementWise_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Addition, "Matrix_Addition_naive"),
        new KeyValuePair<MatOperation, string>(MatOperation.Substraction, "Matrix_Substraction_naive"),
    };

    foreach (var entry in kernelNames)
    {
        if ((operations & entry.Key) > 0)
        {
            OpersKerlsDictionary.Add(entry.Key, MyKernelFactory.Instance.Kernel(callee.GPU, @"Vision\Matrix", entry.Value));
        }
    }

    if (operations > 0 && OpersKerlsDictionary.Count == 0)
    {
        MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to init kernel MatrixOps for undefined MatOperation");
    }
}
/// <summary>
/// Unbinds the vector(s) stored in <paramref name="secondInput"/> from
/// <paramref name="firstInput"/>, writing to <paramref name="output"/>, by delegating
/// to the pointer-based <c>Unbind</c> overload.
/// </summary>
/// <param name="firstInput">Block passed as the first pointer.</param>
/// <param name="secondInput">Block holding one or more vectors of <c>m_inputSize</c> floats.</param>
/// <param name="output">Block receiving the result.</param>
public virtual void Unbind(MyMemoryBlock<float> firstInput, MyMemoryBlock<float> secondInput, MyMemoryBlock<float> output)
{
    int vectorCount = secondInput.Count / m_inputSize;

    var vectorPtrs = vectorCount > 1
        // Several vectors: one pointer per vector, each m_inputSize floats apart.
        ? Enumerable
            .Range(0, vectorCount)
            .Select(index => secondInput.GetDevicePtr(m_owner) + index * m_inputSize * sizeof(float))
        // Otherwise a single pointer to the start of the block.
        : Enumerable.Repeat(secondInput.GetDevicePtr(m_owner), 1);

    Unbind(firstInput.GetDevicePtr(m_owner), vectorPtrs, output.GetDevicePtr(m_owner));
}
/// <summary>
/// Creates an input layer that reads from <paramref name="input"/> starting at
/// <paramref name="offset"/> and exposes an output of the given dimensions.
/// </summary>
/// <param name="network">Network the layer belongs to (forwarded to the base class).</param>
/// <param name="input">Memory block the layer reads from.</param>
/// <param name="offset">Offset into <paramref name="input"/>.</param>
/// <param name="nb">Stored as <c>m_output.Nb</c>.</param>
/// <param name="width">Stored as <c>m_output.Width</c>.</param>
/// <param name="height">Stored as <c>m_output.Height</c>.</param>
/// <param name="nbSamplesPerStep">Number of samples per step.</param>
public MyInputLayer(MyAbstractFeedForwardNode network, MyMemoryBlock<float> input, SizeT offset, SizeT nb, SizeT width, SizeT height, SizeT nbSamplesPerStep)
    : base(network)
{
    // Where the data comes from.
    m_inputBlock = input;
    m_inputOffset = offset;

    // Dimensions of the layer output.
    m_output.Nb = nb;
    m_output.Width = width;
    m_output.Height = height;

    m_nbSamplesPerStep = nbSamplesPerStep;
}
/// <summary>
/// Checks whether a stacking operation can be run on the given inputs and output.
/// </summary>
/// <param name="operation">The operation to validate; exactly one enum value is expected.</param>
/// <param name="inputs">Input blocks; null entries are ignored.</param>
/// <param name="output">Block that must be large enough to hold all inputs.</param>
/// <param name="errorOutput">Null on success, otherwise a description of the problem.</param>
/// <returns>True when the operation can be run (always true for <c>None</c>).</returns>
public static bool Validate(MyStackingOperation operation, IEnumerable<MyMemoryBlock<float>> inputs, MyMemoryBlock<float> output, out string errorOutput)
{
    errorOutput = null;

    // Materialize once: the checks below walk the inputs several times and the source
    // sequence may be lazy or expensive to enumerate.
    var validInputs = inputs.Where(a => a != null).ToList();

    switch (operation)
    {
        case MyStackingOperation.None:
            return true;

        case MyStackingOperation.Concatenate:
        case MyStackingOperation.Interweave:
            break;

        default:
            errorOutput = "Invalid operation. Only a single value within the enum range should be passed.";
            return false;
    }

    if (validInputs.Count == 0)
    {
        errorOutput = "No inputs for stacking operation to run on.";
        return false;
    }

    if (operation == MyStackingOperation.Interweave)
    {
        // Column hints are used as divisors below; reject zero and negative values alike.
        if (validInputs.Any(a => a.ColumnHint <= 0))
        {
            errorOutput = "Invalid column hints. They must be positive.";
            return false;
        }

        MyMemoryBlock<float> first = validInputs[0];
        int rows = first.Count / first.ColumnHint;

        if (validInputs.Any(a => a.Count / a.ColumnHint != rows))
        {
            errorOutput = "Invalid input row counts. Inputs must have the same number of rows.";
            return false;
        }
    }

    if (validInputs.Sum(a => a.Count) > output.Count)
    {
        errorOutput = "Invalid output size: " + output.Count + ". Must be large enough to contain all the inputs.";
        return false;
    }

    return true;
}
/// <summary>
/// Reserves space for this layer's deltas at the current end of the network's shared
/// deltas block, and one slot in the network's data-dims block.
/// </summary>
public override void AllocateMemory()
{
    base.AllocateMemory();

    // Deltas have the same dimensions as the output.
    m_delta = m_output;

    var deltas = m_network.DeltasMemoryBlock;

    // This layer's deltas start where the shared block currently ends.
    m_deltaBlock = deltas;
    m_deltaOffset = deltas.Count;

    // Take one entry of the data-dims block.
    m_deltaDimGPUPtrOffset = m_network.DataDimsMemoryBlock.Count;
    m_network.DataDimsMemoryBlock.Count++;

    // Grow the shared deltas block by this layer's share.
    deltas.Count += m_delta.Count;
}
/// <summary>
/// Binds together all vectors stored in <paramref name="inputs"/> by delegating to the
/// pointer-based <c>Bind</c> overload. The first vector is passed separately; the pointer
/// array holds the remaining vectors followed by the output pointer.
/// </summary>
/// <param name="inputs">Block holding consecutive vectors of <c>m_inputSize</c> floats.</param>
/// <param name="output">Block whose pointer is stored in the last array slot.</param>
public virtual void Bind(MyMemoryBlock<float> inputs, MyMemoryBlock<float> output)
{
    int vectorCount = inputs.Count / m_inputSize;
    CUdeviceptr first = inputs.GetDevicePtr(m_owner);
    CUdeviceptr[] pointers = GetTempArray(vectorCount);

    // Slots 0 .. vectorCount-2 hold vectors 1 .. vectorCount-1 (the first one is skipped).
    int lastSlot = vectorCount - 1;
    for (int vector = 1; vector <= lastSlot; ++vector)
    {
        pointers[vector - 1] = first + vector * m_inputSize * sizeof(float);
    }

    // The last slot is the output.
    pointers[lastSlot] = output.GetDevicePtr(m_owner);

    Bind(first, pointers);
}
/// <summary>
/// Creates only those back ends (kernel, cuBLAS, CPU) that implement at least one of the
/// requested <paramref name="operations"/>. Back ends that are not needed stay unassigned.
/// </summary>
/// <param name="callee">Node on whose behalf the operations run.</param>
/// <param name="operations">Flags selecting the operations that will be used.</param>
/// <param name="A">Not used by this constructor.</param>
public MyMatrixAutoOps(MyWorkingNode callee, MatOperation operations, MyMemoryBlock<float> A = null)
{
    // Custom CUDA kernels.
    MatOperation viaKernels = MyMatrixKernelOps.AvailableOperations() & operations;
    if (viaKernels > 0)
    {
        MatKerlOps = new MyMatrixKernelOps(callee, operations);
    }

    // cuBLAS.
    MatOperation viaCublas = MyMatrixCublasOps.AvailableOperations() & operations;
    if (viaCublas > 0)
    {
        MatCublOps = new MyMatrixCublasOps(callee);
    }

    // Host implementation.
    MatOperation viaCpu = MyMatrixCPUOps.AvailableOperations() & operations;
    if (viaCpu > 0)
    {
        MatCPUOps = new MyMatrixCPUOps(callee);
    }
}
/// <summary>
/// Dispatches a matrix-with-scalar operation to the first back end that supports it:
/// cuBLAS first, then the custom kernels. Logs an error when neither does.
/// </summary>
/// <param name="operation">The operation to run.</param>
/// <param name="A">Input block.</param>
/// <param name="value">Scalar operand.</param>
/// <param name="Result">Block receiving the result.</param>
public override void Run(MatOperation operation, MyMemoryBlock<float> A, float value, MyMemoryBlock<float> Result)
{
    if ((MyMatrixCublasOps.AvailableOperations() & operation) > 0)
    {
        MatCublOps.Run(operation, A, value, Result);
        return;
    }

    if ((MyMatrixKernelOps.AvailableOperations() & operation) > 0)
    {
        MatKerlOps.Run(operation, A, value, Result);
        return;
    }

    MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run undefined MatOps");
}
/// <summary>
/// Unbinds every vector stored in <paramref name="otherInputs"/> from
/// <paramref name="firstInput"/> by delegating to the pointer-based <c>Unbind</c> overload.
/// The pointer array holds one pointer per vector followed by the output pointer.
/// </summary>
/// <param name="firstInput">Block passed as the first pointer.</param>
/// <param name="otherInputs">Block holding consecutive vectors of <c>m_inputSize</c> floats.</param>
/// <param name="output">Block whose pointer is stored in the last array slot.</param>
public virtual void UnbindMultiple(MyMemoryBlock<float> firstInput, MyMemoryBlock<float> otherInputs, MyMemoryBlock<float> output)
{
    int nrInputs = otherInputs.Count / m_inputSize;
    CUdeviceptr firstPtr = firstInput.GetDevicePtr(m_owner);
    CUdeviceptr start = otherInputs.GetDevicePtr(m_owner);
    CUdeviceptr[] arr = GetTempArray(nrInputs + 1); // +1 for output

    // Only the first nrInputs slots point into otherInputs. Slot nrInputs belongs to the
    // output, so no past-the-end pointer is computed for it.
    for (int i = 0; i < nrInputs; ++i)
    {
        arr[i] = start + i * m_inputSize * sizeof(float);
    }

    arr[nrInputs] = output.GetDevicePtr(m_owner);

    Unbind(firstPtr, arr);
}
/// <summary>
/// Sets <c>Count</c> and <c>ColumnHint</c> of <paramref name="Result"/> according to the
/// operation and the sizes of its operands.
/// </summary>
/// <param name="operation">The matrix operation whose result is being sized.</param>
/// <param name="A">First operand; when null the result is sized 1 x 1.</param>
/// <param name="B">Second operand; may be null for single-operand operations.</param>
/// <param name="Result">Block whose size is set; also the return value.</param>
/// <returns><paramref name="Result"/>.</returns>
/// <remarks>
/// Operations not listed below keep the size of A. Branches that divide by
/// <c>A.ColumnHint</c> are skipped while it is 0, leaving the size of A in place
/// instead of throwing a DivideByZeroException.
/// </remarks>
public static MyMemoryBlock<float> SetupResultSize(MatOperation operation, MyMemoryBlock<float> A, MyMemoryBlock<float> B, MyMemoryBlock<float> Result)
{
    if (A == null)
    {
        Result.Count = 1;
        Result.ColumnHint = 1;
        return Result;
    }

    // Default: the result has the same shape as A.
    Result.Count = A.Count;
    Result.ColumnHint = A.ColumnHint;

    if (operation == MatOperation.DotProd)
    {
        // Scalar result.
        Result.ColumnHint = 1;
        Result.Count = 1;
    }
    else if (operation == MatOperation.Multiplication)
    {
        if (B != null && A.ColumnHint != 0 && B.Count > 1)
        {
            Result.ColumnHint = B.ColumnHint;
            Result.Count = B.ColumnHint * A.Count / A.ColumnHint;
        }
    }
    else if (operation == MatOperation.GetCol)
    {
        if (A.ColumnHint != 0) // prevent division by zero
        {
            Result.Count = A.Count / A.ColumnHint;
            Result.ColumnHint = Result.Count;
        }
    }
    else if (operation == MatOperation.GetRow)
    {
        Result.Count = A.ColumnHint;
        Result.ColumnHint = Result.Count;
    }
    else if (operation == MatOperation.MultiplElemntWise || operation == MatOperation.Addition)
    {
        if (B != null)
        {
            Result.ColumnHint = Math.Max(A.ColumnHint, B.ColumnHint);
            Result.Count = Math.Max(A.Count, B.Count);
        }
    }
    else if (operation == MatOperation.Transpose)
    {
        if (A.ColumnHint != 0)
        {
            Result.ColumnHint = A.Count / A.ColumnHint;
        }
    }

    return Result;
}
/// <summary>
/// Loads the kernels needed by the requested distance operations.
/// </summary>
/// <param name="caller">Node on whose behalf the operations run; its GPU is used to load kernels.</param>
/// <param name="operations">Flags selecting the distance operations that will be used.</param>
/// <param name="tempBlock">Optional temporary memory block.</param>
/// <remarks>
/// Each kernel is requested from the factory at most once, even when several of the
/// selected operations share it (e.g. DotProd together with EuclidDist).
/// </remarks>
public MyDistanceOps(MyWorkingNode caller, DistanceOperation operations, MyMemoryBlock<float> tempBlock = null)
{
    m_caller = caller;
    m_operations = operations;
    m_temp = tempBlock;

    bool usesEuclid = operations.HasFlag(DistanceOperation.EuclidDist)
        || operations.HasFlag(DistanceOperation.EuclidDistSquared);
    bool usesHamming = operations.HasFlag(DistanceOperation.HammingDist)
        || operations.HasFlag(DistanceOperation.HammingSim);

    if (usesEuclid)
    {
        // EuclidDist computes EuclidDistSquared first, so keep them together:
        m_operations |= DistanceOperation.EuclidDist | DistanceOperation.EuclidDistSquared;
    }

    // The dot-product kernel serves both DotProd and the Euclidean distances.
    if (usesEuclid || operations.HasFlag(DistanceOperation.DotProd))
    {
        m_dotKernel = MyKernelFactory.Instance.KernelProduct<float>(caller, caller.GPU, ProductMode.f_DotProduct_f);
    }

    if (operations.HasFlag(DistanceOperation.CosDist))
    {
        m_cosKernel = MyKernelFactory.Instance.KernelProduct<float>(caller, caller.GPU, ProductMode.f_Cosine_f);
    }

    // Both Hamming variants use the same sum reduction.
    if (usesHamming)
    {
        m_reduceSumKernel = MyKernelFactory.Instance.KernelReduction<float>(caller, caller.GPU, ReductionMode.f_Sum_f);
    }

    if (usesEuclid || usesHamming)
    {
        m_combineVecsKernel = MyKernelFactory.Instance.Kernel(m_caller.GPU, @"Common\CombineVectorsKernel", "CombineTwoVectorsKernel");
    }
}
/// <summary>
/// Sets <c>ParamsChanged</c> when either the data input or the build file (its path or
/// last write time) differs from what was seen on the previous call.
/// </summary>
protected override void updateInput()
{
    base.updateInput();

    FileInfo currentFile = new FileInfo(BuildFile);

    // Data input changed since the last call?
    if (DataInput != lastDataInput)
    {
        lastDataInput = DataInput;
        ParamsChanged = true;
    }

    // Build file seen for the first time, rewritten, or replaced by another file?
    bool buildFileChanged =
        m_buildLastFile == null
        || m_buildLastFile.LastWriteTime != currentFile.LastWriteTime
        || !currentFile.FullName.Equals(m_buildLastFile.FullName);

    if (buildFileChanged)
    {
        m_buildLastFile = currentFile;
        ParamsChanged = true;
    }
}
/// <summary>
/// Creates the cuBLAS-backed matrix operations for the given node.
/// </summary>
/// <param name="callee">Node on whose behalf the operations run.</param>
/// <param name="operation">Not used by this constructor.</param>
/// <param name="A">Not used by this constructor.</param>
/// <param name="tmp">Not used by this constructor.</param>
public MyMatrixCublasOps(MyWorkingNode callee, MatOperation operation = 0, MyMemoryBlock<float> A = null, MyMemoryBlock<float> tmp = null)
{
    // No private CudaBlas instance is created here; only the calling node is remembered.
    this.callee = callee;
}
/// <summary>
/// Runs a matrix-with-scalar operation. Only Multiplication is implemented: Result is
/// zero-filled and <paramref name="value"/> times A is accumulated into it with Axpy.
/// </summary>
/// <remarks>
/// Result is zero-filled even when the operation is not supported, in which case an
/// error is logged.
/// </remarks>
public override void Run(MatOperation operation, MyMemoryBlock<float> A, float value, MyMemoryBlock<float> Result)
{
    Result.Fill(.0f);

    if (operation == MatOperation.Multiplication)
    {
        MyCublasFactory.Instance.Axpy(value, A.GetDevice(callee), 1, Result.GetDevice(callee), 1);
        return;
    }

    MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
}
/// <summary>
/// Runs an in-place operation on <paramref name="A"/>. Supported: Minus (scale by -1)
/// and Normalize (scale by 1 / Norm2). Other operations only log an error.
/// </summary>
/// <remarks>
/// Normalize leaves A untouched when the reciprocal of its norm is not a finite number
/// (e.g. an all-zero vector), instead of scaling it by Infinity or NaN.
/// </remarks>
public override void Run(MatOperation operation, MyMemoryBlock<float> A)
{
    switch (operation)
    {
        case MatOperation.Minus:
            MyCublasFactory.Instance.Scale(-1.0f, A.GetDevice(callee), 1);
            break;

        case MatOperation.Normalize:
            float nrm = MyCublasFactory.Instance.Norm2(A.GetDevice(callee), 1);
            float scale = 1 / nrm;

            // A zero (or NaN) norm would turn every element into NaN.
            if (!float.IsNaN(scale) && !float.IsInfinity(scale))
            {
                MyCublasFactory.Instance.Scale(scale, A.GetDevice(callee), 1);
            }
            break;

        default:
            MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
            break;
    }
}
/// <summary>
/// Runs a single-operand operation on <paramref name="A"/> and stores the outcome in
/// <paramref name="Result"/>, which is zero-filled first. Supported: MinIndex, MaxIndex,
/// Norm2, Normalize, Minus and Copy. Other operations only log an error.
/// </summary>
/// <remarks>
/// Normalize leaves Result zero-filled when the reciprocal of the norm of A is not a
/// finite number (e.g. an all-zero vector), instead of filling it with NaN.
/// </remarks>
public override void Run(MatOperation operation, MyMemoryBlock<float> A, MyMemoryBlock<float> Result)
{
    int itmp;
    Result.Fill(.0f);

    switch (operation)
    {
        case MatOperation.MinIndex:
            itmp = MyCublasFactory.Instance.Min(A.GetDevice(callee), 1);
            // The returned index is shifted down by one before being stored.
            Result.Fill((float)(itmp - 1));
            break;

        case MatOperation.MaxIndex:
            itmp = MyCublasFactory.Instance.Max(A.GetDevice(callee), 1);
            // The returned index is shifted down by one before being stored.
            Result.Fill((float)(itmp - 1));
            break;

        case MatOperation.Norm2:
            MyCublasFactory.Instance.Norm2(A.GetDevice(callee), 1, Result.GetDevice(callee));
            break;

        case MatOperation.Normalize:
            float nrm = MyCublasFactory.Instance.Norm2(A.GetDevice(callee), 1);
            float scale = 1 / nrm;

            // A zero (or NaN) norm would turn every element into NaN.
            if (!float.IsNaN(scale) && !float.IsInfinity(scale))
            {
                MyCublasFactory.Instance.Axpy(scale, A.GetDevice(callee), 1, Result.GetDevice(callee), 1);
            }
            break;

        case MatOperation.Minus:
            MyCublasFactory.Instance.Axpy(-1.0f, A.GetDevice(callee), 1, Result.GetDevice(callee), 1);
            break;

        case MatOperation.Copy:
            MyCublasFactory.Instance.Copy(A.GetDevice(callee), 1, Result.GetDevice(callee), 1);
            break;

        default:
            MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
            break;
    }
}
/// <summary>
/// Runs a two-operand operation. Supported: Multiplication and DotProd; other
/// operations only log an error.
/// </summary>
/// <remarks>
/// For Multiplication, an operand with exactly one element is read back to the host and
/// used as a scalar factor for the other operand (A is checked first). All remaining
/// cases are forwarded to the pointer-based <c>Run</c> overload. Vectors/matrices have
/// to be in the correct dimensions.
/// </remarks>
public override void Run(MatOperation operation, MyMemoryBlock<float> A, MyMemoryBlock<float> B, MyMemoryBlock<float> Result)
{
    switch (operation)
    {
        case MatOperation.Multiplication:
            if (A.Count == 1 || B.Count == 1)
            {
                // scalar * vector; A takes precedence as the scalar when both have one element
                MyMemoryBlock<float> scalar = A.Count == 1 ? A : B;
                MyMemoryBlock<float> vector = A.Count == 1 ? B : A;

                Result.Fill(.0f);
                scalar.SafeCopyToHost();
                MyCublasFactory.Instance.Axpy(scalar.Host[0], vector.GetDevice(callee), 1, Result.GetDevice(callee), 1);
            }
            else
            {
                // general case
                Run(operation,
                    A.GetDevice(callee), A.Count, A.ColumnHint,
                    B.GetDevice(callee), B.Count, B.ColumnHint,
                    Result.GetDevice(callee), Result.Count, Result.ColumnHint,
                    0);
            }
            break;

        case MatOperation.DotProd:
            Run(operation,
                A.GetDevice(callee), A.Count, A.ColumnHint,
                B.GetDevice(callee), B.Count, B.ColumnHint,
                Result.GetDevice(callee), Result.Count, Result.ColumnHint,
                0);
            break;

        default:
            MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
            break;
    }
}
/// <summary>
/// Sets up the genetic task: loads the kernels, locates the first top-level neural network
/// group, allocates the population and helper memory blocks, generates the initial
/// chromosomes, and disables back-propagation learning on the network.
/// </summary>
/// <param name="nGPU">GPU identifier used to load the kernels.</param>
/// <exception cref="NullReferenceException">No top-level NeuralNetworkGroup was found.</exception>
public override void Init(int nGPU)
{
    currentGen = 0;
    m_weights = 0;

    // Load the relevant kernels
    m_coeffGenKernel = MyKernelFactory.Instance.Kernel(nGPU, @"Genetic\CosyneGenetics", "generateCoefficients");
    m_geneticKernel = MyKernelFactory.Instance.Kernel(nGPU, @"Genetic\CosyneGenetics", "grow");
    m_extractKernel = MyKernelFactory.Instance.Kernel(nGPU, @"Genetic\CosyneGenetics", "extractCoeffs");
    m_cosineGenKernel = MyKernelFactory.Instance.Kernel(nGPU, @"Genetic\CosyneGenetics", "createCosineMatrix");
    m_implantKernel = MyKernelFactory.Instance.Kernel(nGPU, @"Genetic\CosyneGenetics", "implantCoeffs");

    // Init the random generator
    m_rand = new Random();

    // Set up coefficient Generation
    m_coeffGenKernel.SetupExecution(Owner.PopulationSize);
    // Set up genetic recombination
    m_geneticKernel.SetupExecution(Owner.PopulationSize);

    // This finds the first nn group in the network. Possibility of getting a list of networks and evolving them all seperately?
    List<MyNode> ch = Owner.Owner.Network.Children;
    foreach (MyNode n in ch)
    {
        if (n is MyNeuralNetworkGroup)
        {
            nn = n as MyNeuralNetworkGroup;
            MyLog.INFO.WriteLine("Evolving the layers of node: " + nn.Name);
            break;
        }
    }
    if (nn == null)
    {
        throw new NullReferenceException("There is no top level NeuralNetworkGroup.");
    }

    // Construct the layerlist which is to be read from and written to
    constructLayerList(nn);

    // This is how big the weight matrix will be
    arr_size = (int)Math.Ceiling(Math.Sqrt(m_weights));

    // Get the relevant execution plan
    m_executionPlan = Owner.Owner.SimulationHandler.Simulation.ExecutionPlan[0];

    // Check saved Coeffs size.
    // This has to happen BEFORE the memory blocks are sized: tempMB, marking and
    // chromosomePop all depend on the final value of CoefficientsSaved.
    if (CoefficientsSaved > m_weights)
    {
        MyLog.WARNING.Write("Saving more Coefficients than exist in the weight matrix. Setting to max permissable value\n");
        CoefficientsSaved = m_weights;
    }
    if (CoefficientsSaved == m_weights)
    {
        MyLog.INFO.Write("Saving a coefficient for every weight. Evolving weights directly\n");
        DirectEvolution = true;
    }
    if (DirectEvolution)
    {
        CoefficientsSaved = m_weights;
    }

    #region MemoryBlocks
    // Initialise the population
    population = new List<MyMemoryBlock<float>>();
    outputPop = new List<MyMemoryBlock<float>>();
    for (int i = 0; i < Owner.PopulationSize; i++)
    {
        population.Add(new MyMemoryBlock<float>());
        population[i].Owner = Owner;
        population[i].Count = arr_size * arr_size;
        population[i].AllocateMemory();

        outputPop.Add(new MyMemoryBlock<float>());
        outputPop[i].Owner = Owner;
        outputPop[i].Count = arr_size * arr_size;
        outputPop[i].AllocateMemory();
    }

    // Allocate space to manipulate weight matrices on the device
    cudaMatrices = new MyMemoryBlock<float>();
    cudaMatrices.Owner = Owner;
    cudaMatrices.Count = arr_size * arr_size * Owner.PopulationSize;
    cudaMatrices.AllocateDevice();

    // Allocate a memory block for the Cosine matrix
    multiplier = new MyMemoryBlock<float>();
    multiplier.Owner = Owner;
    multiplier.Count = arr_size * arr_size;
    multiplier.AllocateDevice();

    // Fill the cosine Matrices
    m_cosineGenKernel.SetupExecution(arr_size);
    m_cosineGenKernel.Run(multiplier, arr_size);

    // Allocate space needed for chromosomes
    chromosomePop = new MyMemoryBlock<float>();
    chromosomePop.Owner = Owner;
    if (DirectEvolution)
    {
        chromosomePop.Count = m_weights * Owner.PopulationSize;
    }
    else
    {
        chromosomePop.Count = CoefficientsSaved * Owner.PopulationSize;
    }
    chromosomePop.AllocateMemory();

    // Allocate some space for noise to seed the cuda_rand generator
    noise = new MyMemoryBlock<float>();
    noise.Owner = Owner;
    noise.Count = Owner.PopulationSize;
    noise.AllocateMemory();

    // Write some noise to the initial array
    for (int i = 0; i < Owner.PopulationSize; i++)
    {
        noise.Host[i] = (float)m_rand.NextDouble() * 100000 + (float)m_rand.NextDouble() * 40;
    }
    noise.SafeCopyToDevice();

    // Allocate space for the fitnesses
    fitnesses = new MyMemoryBlock<float>();
    fitnesses.Owner = Owner;
    fitnesses.Count = Owner.PopulationSize;
    fitnesses.AllocateMemory();

    // Allocate some temporary storage
    tempMB = new MyMemoryBlock<float>();
    tempPop = new MyMemoryBlock<float>();

    tempMB.Owner = Owner;
    tempMB.Count = CoefficientsSaved;
    tempMB.AllocateDevice();

    tempPop.Owner = Owner;
    tempPop.Count = arr_size * arr_size;
    tempPop.AllocateDevice();

    marking = new MyMemoryBlock<int>();
    marking.Owner = Owner;
    marking.Count = CoefficientsSaved * Owner.PopulationSize;
    marking.AllocateDevice();
    #endregion

    // Generate the rest of the population
    if (DirectEvolution)
    {
        m_coeffGenKernel.Run(chromosomePop, CoefficientsSaved, noise, Owner.PopulationSize, WeightMagnitude);
    }
    else
    {
        m_coeffGenKernel.Run(chromosomePop, CoefficientsSaved, noise, Owner.PopulationSize, Alpha);
    }

    // Disable Backprop tasks in Network
    if (nn.GetActiveBackpropTask() != null)
    {
        if (!nn.GetActiveBackpropTask().DisableLearning)
        {
            MyLog.WARNING.WriteLine("Disabling backprop learning for Neural Network");
            nn.GetActiveBackpropTask().DisableLearning = true;
        }
    }
}
/// <summary>
/// Transforms all the vectors stored in <paramref name="vectors"/> to be pair-wise orthonormal using a modified version of the Gram-Schmidt algorithm.
/// </summary>
/// <param name="vectors">The vectors to orthonormalize.</param>
/// <param name="temp">A vector of temporal space.</param>
/// <param name="xDim">The length of each vector.</param>
/// <param name="yDim">The number of vectors.</param>
/// <param name="dotKernel">The kernel to compute a dot product.</param>
/// <param name="multKernel">The kernel to compute vector combinations.</param>
/// <param name="GPU">The GPU identifier used to obtain device pointers into <paramref name="vectors"/>.</param>
/// <remarks>
/// Vectors whose squared length is below 1e-7 are skipped: they are neither normalized
/// nor used to orthogonalize the remaining vectors.
/// </remarks>
public static void OrthonormalizeVectors(MyMemoryBlock<float> vectors, MyMemoryBlock<float> temp, int xDim, int yDim, MyProductKernel<float> dotKernel, MyCudaKernel multKernel, int GPU)
{
    int count = xDim * yDim;

    Debug.Assert(vectors != null && temp != null, "Missing data!");
    Debug.Assert(dotKernel != null && multKernel != null, "Missing a kernel!");
    Debug.Assert(xDim > 0 && yDim > 0, "Negative matrix dimensions!");
    Debug.Assert(vectors.Count >= count, "Too little vectors to orthonormalize!");
    Debug.Assert(temp.Count >= xDim, "Too little temp space!");

    multKernel.SetupExecution(xDim);

    for (int i = 0; i < count; i += xDim)
    {
        var curr = vectors.GetDevicePtr(GPU, i);

        // Normalize the current vector
        {
            // The squared length has to land in temp[0], which is read below. The inner
            // loop leaves outOffset at xDim, so it is set explicitly on every pass.
            dotKernel.outOffset = 0;
            dotKernel.Run(temp, curr, curr, xDim);

            // The dot product was computed on the device; bring it to the host before reading it.
            temp.SafeCopyToHost(0, 1);

            if (temp.Host[0] < 0.0000001f)
            {
                continue;
            }

            temp.Host[0] = (float)(1 / Math.Sqrt(temp.Host[0]));
            temp.SafeCopyToDevice(0, 1);

            multKernel.Run(curr, temp, curr, (int)MyJoin.MyJoinOperation.Multiplication, xDim, 1);
        }

        // Make all the remaining vectors orthogonal to the current one
        for (int j = i + xDim; j < count; j += xDim)
        {
            var next = vectors.GetDevicePtr(GPU, j);

            // Compute and subtract the projection onto the current vector
            // NOTE(review): the dot product is written at offset xDim while the assertion
            // above only guarantees temp.Count >= xDim, and the following multiplication
            // reads temp from offset 0 - confirm the intended layout of temp.
            dotKernel.outOffset = xDim;
            dotKernel.Run(temp, curr, next, xDim);

            multKernel.Run(curr, temp, temp, (int)MyJoin.MyJoinOperation.Multiplication, xDim, 1);
            multKernel.Run(next, temp, next, (int)MyJoin.MyJoinOperation.Subtraction, xDim, xDim);
        }
    }
}
/// <summary>
/// Normalizes vectors along the leading dimension.
/// </summary>
/// <param name="vectors">Block holding <paramref name="otherDim"/> consecutive vectors of <paramref name="leadingDim"/> elements.</param>
/// <param name="temp">Scratch block; one entry per vector is used for its scaling factor.</param>
/// <param name="leadingDim">Length of each vector.</param>
/// <param name="otherDim">Number of vectors.</param>
/// <param name="dotKernel">The kernel to compute a dot product.</param>
/// <param name="multKernel">The kernel to compute vector combinations.</param>
/// <param name="GPU">The GPU identifier used to obtain device pointers.</param>
/// <remarks>
/// Vectors whose squared length is below 1e-7 get a scaling factor of 0.
/// </remarks>
public static void NormalizeLeadingDim(
    MyMemoryBlock<float> vectors, MyMemoryBlock<float> temp,
    int leadingDim, int otherDim,
    MyProductKernel<float> dotKernel, MyCudaKernel multKernel, int GPU)
{
    var total = leadingDim * otherDim;

    Debug.Assert(vectors != null && temp != null, "Missing data!");
    Debug.Assert(dotKernel != null && multKernel != null, "Missing kernels.");
    Debug.Assert(leadingDim > 0 && otherDim > 0, "Negative matrix dimensions!");
    Debug.Assert(vectors.Count >= total, "Too little vectors to orthonormalize!");
    Debug.Assert(temp.Count >= Math.Max(leadingDim, otherDim), "Too little temp space!");

    multKernel.SetupExecution(leadingDim);

    // Pass 1: squared length of vector k goes to temp[k].
    for (int k = 0; k < otherDim; k++)
    {
        var segment = vectors.GetDevicePtr(GPU, k * leadingDim);
        dotKernel.outOffset = k;
        dotKernel.Run(temp, segment, segment, leadingDim);
    }

    // Pass 2 (host): turn squared lengths into scaling factors.
    temp.SafeCopyToHost(0, otherDim);

    for (int k = 0; k < otherDim; k++)
    {
        temp.Host[k] = temp.Host[k] < 0.0000001f
            ? 0
            : (float)(1 / Math.Sqrt(temp.Host[k]));
    }

    temp.SafeCopyToDevice(0, otherDim);

    // Pass 3: scale every vector by its own factor.
    for (int k = 0; k < otherDim; k++)
    {
        var segment = vectors.GetDevicePtr(GPU, k * leadingDim);
        var factor = temp.GetDevicePtr(GPU, k);
        multKernel.Run(segment, factor, segment, (int)MyJoin.MyJoinOperation.Multiplication, leadingDim, 1);
    }
}
/// <summary>
/// Generates a random transform matrix of <paramref name="xDim"/> x <paramref name="yDim"/> elements
/// into <paramref name="unmanagedVectors"/>, optionally orthonormalizing and/or normalizing its vectors.
/// NOTE(review): the in-code comments treat the layout with yDim as the leading dimension as the
/// "correct position", while the previous summary named xDim as the leading dimension - confirm.
/// </summary>
/// <param name="unmanagedVectors">A memory block to store the generated matrix.
/// Must be as large as <paramref name="xDim"/> x <paramref name="yDim"/>.</param>
/// <param name="unmanagedBaseVectors">A temporary block to store all the base vectors.
/// Must be as large as Max(<paramref name="xDim"/>, <paramref name="yDim"/>)^2.
/// Only necessary when <paramref name="mode"/> is set to <see cref="VectorGenerationMode.AverageBaseVectors"/>.</param>
/// <param name="temp">The temporary storage. It should be as long as the longer of the dimensions.</param>
/// <param name="random">The random object for number generation.</param>
/// <param name="xDim">The size of the other dimension.</param>
/// <param name="yDim">The size of the leading dimension.</param>
/// <param name="dotKernel">The kernel to compute a dot product.</param>
/// <param name="multKernel">The kernel to compute vector combinations.</param>
/// <param name="transposeKernel">The kernel used to transpose the matrix; it is run with the same block as input and output.</param>
/// <param name="GPU">The GPU identifier used to obtain device pointers.</param>
/// <param name="mode">Selects how the vectors are generated: plain random vectors (Normal),
/// orthonormalized along the larger dimension (Orthonormalize), or sums of orthonormal
/// base vectors (AverageBaseVectors).</param>
/// <param name="axisToNormalize">The axis along which to normalize vectors after orthonormalization.</param>
public static void GenerateTransformMatrix(
    MyMemoryBlock<float> unmanagedVectors, MyMemoryBlock<float> unmanagedBaseVectors, MyMemoryBlock<float> temp,
    Random random, int xDim, int yDim,
    MyProductKernel<float> dotKernel, MyCudaKernel multKernel, MyCudaKernel transposeKernel, int GPU,
    VectorGenerationMode mode = VectorGenerationMode.Normal, AxisToNormalizeEnum axisToNormalize = AxisToNormalizeEnum.yDim)
{
    Debug.Assert(random != null, "Missing random object");
    Debug.Assert(unmanagedVectors != null && (mode != VectorGenerationMode.AverageBaseVectors || unmanagedBaseVectors != null) && temp != null, "Missing data!");
    Debug.Assert(dotKernel != null && multKernel != null && transposeKernel != null, "Missing a kernel!");

    // Mapping to rows --- Column-major storage --- rows will be the leading dimension
    // The larger dimension vectors will be orthogonal; the cols dimension vectors will be normalized

    switch (mode)
    {
        case VectorGenerationMode.Normal:
            if (axisToNormalize == AxisToNormalizeEnum.xDim)
            {
                // Generate normalized vectors with xDim as the leading dim
                GenerateRandomNormalVectors(unmanagedVectors.Host, random, xDim, yDim);
                unmanagedVectors.SafeCopyToDevice();

                // Transpose to the correct position
                transposeKernel.Run(unmanagedVectors, unmanagedVectors, xDim, yDim);
            }
            else
            {
                // Generate normalized vectors with yDim as the leading dim; no transposition is needed
                GenerateRandomNormalVectors(unmanagedVectors.Host, random, yDim, xDim);
                unmanagedVectors.SafeCopyToDevice();
            }
            break;

        case VectorGenerationMode.Orthonormalize:
            int largerDim = Math.Max(xDim, yDim);
            int smallerDim = Math.Min(xDim, yDim);

            // Generate vectors with larger leading dimension
            GenerateRandomNormalVectors(unmanagedVectors.Host, random, largerDim, smallerDim, normalize: false);
            unmanagedVectors.SafeCopyToDevice();

            // Orthonormalize along the larger dimension
            OrthonormalizeVectors(unmanagedVectors, temp, largerDim, smallerDim, dotKernel, multKernel, GPU);

            if (xDim > yDim)
            {
                // xDim is leading and is normalized
                // We need to transpose to get the correct dims
                transposeKernel.Run(unmanagedVectors, unmanagedVectors, xDim, yDim);

                if (axisToNormalize == AxisToNormalizeEnum.yDim)
                    NormalizeLeadingDim(unmanagedVectors, temp, yDim, xDim, dotKernel, multKernel, GPU);
            }
            else
            {
                // yDim is leading and is normalized
                // The matrix is in correct position

                if (axisToNormalize == AxisToNormalizeEnum.xDim)
                {
                    // TODO: generate the matrix with transposed dims?
                    // TODO: SMELLY VERSION:
                    // Transpose so that xDim leads, normalize along it, then transpose back.
                    transposeKernel.Run(unmanagedVectors, unmanagedVectors, yDim, xDim);
                    NormalizeLeadingDim(unmanagedVectors, temp, xDim, yDim, dotKernel, multKernel, GPU);
                    transposeKernel.Run(unmanagedVectors, unmanagedVectors, xDim, yDim);
                }
            }
            break;

        case VectorGenerationMode.AverageBaseVectors:
            int longerDim = Math.Max(xDim, yDim);
            int shorterDim = Math.Min(xDim, yDim);

            // Build a square longerDim x longerDim set of orthonormal base vectors first.
            GenerateTransformMatrix(
                unmanagedBaseVectors, null, temp,
                random, longerDim, longerDim,
                dotKernel, multKernel, transposeKernel, GPU,
                VectorGenerationMode.Orthonormalize);

            // NOTE(review): for a square matrix the method returns here and nothing is written
            // to unmanagedVectors in this call - confirm callers pass the same block for both
            // or otherwise expect this.
            if (shorterDim == longerDim)
                break;

            // Every output vector is the sum of a consecutive run of base vectors; the run
            // boundaries advance by longerDim / shorterDim and are truncated to integers.
            float it = 0f;
            float step = longerDim / (float)shorterDim;
            int beg, end = 0;

            for (int i = 0; i < shorterDim; i++)
            {
                beg = end;
                it += step;
                end = (int)it;

                var vect = unmanagedVectors.GetDevicePtr(GPU, i * longerDim);

                for (int j = beg; j < end; j++)
                {
                    var baseVect = unmanagedBaseVectors.GetDevicePtr(GPU, j * longerDim);
                    // Accumulates into vect; NOTE(review): unmanagedVectors is not cleared in this
                    // method, so it presumably has to be zeroed by the caller - confirm.
                    multKernel.Run(baseVect, vect, vect, (int)MyJoin.MyJoinOperation.Addition, longerDim, longerDim);
                }
            }

            if (xDim > yDim)
            {
                // xDim is leading and is not normalized
                // We need to transpose to get the correct dims

                if (axisToNormalize == AxisToNormalizeEnum.xDim)
                {
                    NormalizeLeadingDim(unmanagedVectors, temp, xDim, yDim, dotKernel, multKernel, GPU);

                    transposeKernel.Run(unmanagedVectors, unmanagedVectors, xDim, yDim);
                }
                else
                {
                    transposeKernel.Run(unmanagedVectors, unmanagedVectors, xDim, yDim);

                    NormalizeLeadingDim(unmanagedVectors, temp, yDim, xDim, dotKernel, multKernel, GPU);
                }
            }
            else
            {
                // yDim is leading and is not normalized
                // The matrix is in correct position

                if (axisToNormalize == AxisToNormalizeEnum.yDim)
                    NormalizeLeadingDim(unmanagedVectors, temp, yDim, xDim, dotKernel, multKernel, GPU);
                else
                {
                    // TODO: SMELLY VERSION:
                    // Transpose so that xDim leads, normalize along it, then transpose back.
                    transposeKernel.Run(unmanagedVectors, unmanagedVectors, yDim, xDim);
                    NormalizeLeadingDim(unmanagedVectors, temp, xDim, yDim, dotKernel, multKernel, GPU);
                    transposeKernel.Run(unmanagedVectors, unmanagedVectors, xDim, yDim);
                }
            }
            break;
    }
}
/// <summary>
/// Reserves this layer's regions inside the network's shared memory blocks. Weights and
/// biases go to the weights block; weight changes, bias changes, last weight deltas and
/// stored outputs go to the weight-changes block. Each region starts at the current end
/// of its block and also takes one slot in the data-dims block.
/// </summary>
/// <remarks>
/// The order of the sections matters: every offset is the block size at the moment the
/// section runs.
/// </remarks>
public override void AllocateMemory()
{
    base.AllocateMemory();

    // The change buffers have the same dimensions as the values they belong to.
    m_weightChange = m_weight;
    m_biasChange = m_bias;

    var weights = m_network.WeightsMemoryBlock;
    var changes = m_network.WeightChangesMemoryBlock;
    var dims = m_network.DataDimsMemoryBlock;

    // Weights
    m_weightBlock = weights;
    m_weightOffset = weights.Count;
    weights.Count += m_weight.Count;
    m_weightDimGPUPtrOffset = dims.Count;
    dims.Count++;

    // Weight changes
    m_weightChangeBlock = changes;
    m_weightChangeOffset = changes.Count;
    changes.Count += m_weightChange.Count;
    m_weightChangeDimGPUPtrOffset = dims.Count;
    dims.Count++;

    // Biases (stored in the weights block, after the weights)
    m_biasBlock = weights;
    m_biasOffset = weights.Count;
    weights.Count += m_bias.Count;
    m_biasDimGPUPtrOffset = dims.Count;
    dims.Count++;

    // Bias changes
    m_biasChangeBlock = changes;
    m_biasChangeOffset = changes.Count;
    changes.Count += m_biasChange.Count;
    m_biasChangeDimGPUPtrOffset = dims.Count;
    dims.Count++;

    // Last weight deltas
    m_lastWeightDeltaBlock = changes;
    m_lastWeightDeltaOffset = changes.Count;
    changes.Count += m_lastWeightDelta.Count;
    m_lastWeightDeltaDimGPUPtrOffset = dims.Count;
    dims.Count++;

    // Stored outputs
    m_storedOutputBlock = changes;
    m_storedOutputOffset = changes.Count;
    changes.Count += m_storedOutput.Count;
    m_storedOutputDimGPUPtrOffset = dims.Count;
    dims.Count++;
}
/// <summary>
/// Runs the RBM backward kernel for this layer, with one execution item per input element.
/// </summary>
/// <param name="PreviousLayerBias">Bias block of the previous layer, passed to the kernel.</param>
/// <param name="SigmoidSteepness">Steepness of the sigmoid, passed to the kernel.</param>
internal void RBMBackward(MyMemoryBlock<float> PreviousLayerBias, float SigmoidSteepness)
{
    MyLog.DEBUG.WriteLine("RBM backward from " + Name);

    m_RBMBackwardKernel.SetupExecution(Input.Count);

    m_RBMBackwardKernel.Run(
        Output,
        Input,
        Weights,
        PreviousLayerBias,
        SigmoidSteepness,
        Input.Count,
        Neurons);
}