Example #1
0
        protected override void Reset()
        {
            List <String> actions = Target.GetActionLabels();

            // Dispose the previous string buffer before reallocating — each Reset
            // otherwise leaks its device allocation until finalization.
            if (m_StringDeviceBuffer != null)
            {
                m_StringDeviceBuffer.Dispose();
            }
            m_StringDeviceBuffer = new CudaDeviceVariable <float>(1000);
            m_StringDeviceBuffer.Memset(0);

            // Rebuild the GPU label strip only when the number of actions grew.
            if (numOfActions < actions.Count)
            {
                if (m_actionLabels != null)
                {
                    m_actionLabels.Dispose();
                }

                m_actionLabels = new CudaDeviceVariable <uint>(actions.Count * LABEL_PIXEL_WIDTH * LABEL_PIXEL_WIDTH);
                m_actionLabels.Memset(0);

                // Render each action label into its LABEL_PIXEL_WIDTH-wide slot.
                for (int i = 0; i < actions.Count; i++)
                {
                    MyDrawStringHelper.String2Index(actions[i], m_StringDeviceBuffer);
                    MyDrawStringHelper.DrawStringFromGPUMem(
                        m_StringDeviceBuffer, i * LABEL_PIXEL_WIDTH + 5, 8, 0, 0xFFFFFFFF,
                        m_actionLabels.DevicePointer, LABEL_PIXEL_WIDTH * actions.Count, LABEL_PIXEL_WIDTH, 0, actions[i].Length);
                }

                numOfActions = actions.Count;
            }

            // Refresh the Q-matrix data and redraw when its size is acceptable.
            Target.ReadTwoDimensions(ref m_qMatrix, ref m_qMatrixActions, XAxisVariableIndex, YAxisVariableIndex, ApplyInnerScaling);

            if (MatrixSizeOK())
            {
                DrawDataToGpu();
            }
        }
        protected override void Reset()
        {
            // Regenerate the action-label texture when more actions have appeared,
            // then refresh the Q-matrix view.
            List <MyMotivatedAction> actionList = Target.Rds.ActionManager.Actions;
            int actionCount = actionList.Count;

            if (numOfActions < actionCount)
            {
                if (m_actionLabels != null)
                {
                    m_actionLabels.Dispose();
                }

                m_actionLabels = new CudaDeviceVariable <uint>(actionCount * LABEL_PIXEL_WIDTH * LABEL_PIXEL_WIDTH);
                m_actionLabels.Memset(0);

                // One label per LABEL_PIXEL_WIDTH-wide slot across the strip.
                int stripWidth = LABEL_PIXEL_WIDTH * actionCount;
                for (int idx = 0; idx < actionCount; idx++)
                {
                    MyDrawStringHelper.DrawString(actionList[idx].GetLabel(), idx * LABEL_PIXEL_WIDTH + 5, 8, 0, 0xFFFFFFFF, m_actionLabels.DevicePointer, stripWidth, LABEL_PIXEL_WIDTH);
                }

                numOfActions = actionCount;
            }

            Target.Vis.ReadTwoDimensions(ref m_qMatrix, ref m_qMatrixActions, XAxisVariableIndex, YAxisVariableIndex, ShowCurrentMotivations);

            if (MatrixSizeOK())
            {
                DrawDataToGpu();
            }
        }
Example #3
0
        public ConvolutionalLayerNPP(int widthIn, int heightIn, int channelsIn, int widthOut, int heightOut, int channelsOut, int batch, int filterWidth, int filterHeight, Activation activation, CudaContext ctx, CUmodule module)
            : base(widthIn, heightIn, channelsIn, widthOut, heightOut, channelsOut, batch)
        {
            // Allocates weight/bias/output/scratch buffers for a convolutional
            // layer and, for (P/Leaky/plain) Relu activations, the PRelu forward
            // kernel plus its per-channel alpha buffer.
            _activation      = activation;
            _filterX         = filterWidth;
            _filterY         = filterHeight;
            _weights         = new CudaDeviceVariable <float>(filterWidth * filterHeight * channelsIn * channelsOut);
            _bias            = new CudaDeviceVariable <float>(channelsOut);
            _y               = new CudaDeviceVariable <float>(widthOut * heightOut * channelsOut * batch);
            _z               = new CudaDeviceVariable <float>(widthOut * heightOut * channelsOut * batch);
            _tempConvolution = new CudaDeviceVariable <float>(widthOut * heightOut);

            // PRelu, LeakyRelu and Relu all share the same PRelu forward kernel;
            // plain Relu is expressed as PRelu with alpha fixed to 0.
            if (_activation == Activation.PRelu || _activation == Activation.LeakyRelu || _activation == Activation.Relu)
            {
                _aRelu = new CudaDeviceVariable <float>(channelsOut);
                if (_activation == Activation.Relu)
                {
                    _aRelu.Memset(0);     // fixed alpha of 0 turns PRelu into plain Relu
                }
                _KernelPReluForward = new PReluForwardKernel(module, ctx);
                _KernelPReluForward.SetComputeSize((uint)widthOut * (uint)heightOut, (uint)channelsOut, (uint)batch);
            }
        }
Example #4
0
        public void FillAll(float value)
        {
            // Fill the whole bounded sequence with the bit pattern of <value>,
            // reinterpreted as a 32-bit word for the device-side memset.
            CudaDeviceVariable <T> deviceVar    = GetDevice(Owner.GPU);
            CudaDeviceVariable <T> sequenceView = new CudaDeviceVariable <T>(deviceVar.DevicePointer, false, BoundedSequenceLength * GetSize());
            uint pattern = BitConverter.ToUInt32(BitConverter.GetBytes(value), 0);

            sequenceView.Memset(pattern);
        }
 protected override void Reset()
 {
     // Free the previous buffer before reallocating so repeated resets do not
     // leak device memory (CudaDeviceVariable is IDisposable).
     if (m_StringDeviceBuffer != null)
     {
         m_StringDeviceBuffer.Dispose();
     }
     m_StringDeviceBuffer = new CudaDeviceVariable <float>(1000);
     m_StringDeviceBuffer.Memset(0);

     // Texture dimensions follow the attention-map layout.
     TextureWidth  = Target.AttentionMap.ColumnHint;
     TextureHeight = Target.AttentionMap.Count / TextureWidth;
 }
Example #6
0
        public override void Fill(bool value)
        {
            // Fill the time-offsetted slice of the device buffer with the given
            // boolean.
            //
            // Fix: BitConverter.GetBytes(bool) returns a single byte, so
            // BitConverter.ToUInt32(bytes, 0) always threw ArgumentException
            // (it requires at least 4 bytes). Build the 32-bit fill pattern
            // directly instead: true -> 1, false -> 0 per 32-bit word, matching
            // the pattern the original conversion was trying to produce.
            CudaDeviceVariable <T> rDeviceVar              = GetDevice(Owner.GPU);
            CUdeviceptr            timeOffsettedPtr        = rDeviceVar.DevicePointer + TimeOffset * rDeviceVar.TypeSize;
            CudaDeviceVariable <T> rTimeOffsettedDeviceVar = new CudaDeviceVariable <T>(timeOffsettedPtr, false, GetSize());

            rTimeOffsettedDeviceVar.Memset(value ? 1u : 0u);
        }
        protected override void Execute()
        {
            // Recompute and redraw the histogram every UPDATE_STEP simulation
            // steps (and on the very first step).
            if (SimulationStep % UPDATE_STEP == 0 || SimulationStep == 1)
            {
                // One int counter per bin in dynamic shared memory.
                m_computeHistogram.DynamicSharedMemory = (uint)((int)sizeof(int) * BINS);

                m_computeHistogram.SetConstantVariable("D_BINS", BINS);
                m_computeHistogram.SetConstantVariable("D_MAX_VALUE", MAX_VALUE);
                m_computeHistogram.SetConstantVariable("D_MIN_VALUE", MIN_VALUE);
                m_computeHistogram.SetConstantVariable("D_BIN_VALUE_WIDTH", BIN_VALUE_WIDTH);
                m_computeHistogram.SetConstantVariable("D_MEMORY_BLOCK_SIZE", Target.Count);

                // Zero the bin counts, then launch sized to the target element count.
                m_d_HistogramData.Memset(0);
                m_computeHistogram.SetupExecution(
                    Target.Count
                    );

                m_computeHistogram.Run(
                    Target,
                    m_d_HistogramData.DevicePointer
                    );

                // Render the bin counts into the VBO texture.
                m_visualizeHistogram.SetConstantVariable("D_BINS", BINS);
                m_visualizeHistogram.SetConstantVariable("D_BIN_PIXEL_WIDTH", BIN_PIXEL_WIDTH);
                m_visualizeHistogram.SetConstantVariable("D_BIN_PIXEL_HEIGHT", BIN_PIXEL_HEIGHT);
                m_visualizeHistogram.SetConstantVariable("D_COLOR_ONE", COLOR_ONE);
                m_visualizeHistogram.SetConstantVariable("D_COLOR_TWO", COLOR_TWO);
                m_visualizeHistogram.SetConstantVariable("D_COLOR_BACKGROUND", BACKGROUND);
                m_visualizeHistogram.SetConstantVariable("D_OUT_OF_BOUNDS", OUT_OF_BOUNDS);

                // Launch configuration derived from bin count and bin pixel width.
                m_visualizeHistogram.SetupExecution(
                    new dim3(BIN_PIXEL_WIDTH, 1, 1),
                    new dim3(BINS, 1, 1)
                    );

                m_visualizeHistogram.Run(
                    m_d_HistogramData.DevicePointer,
                    VBODevicePointer
                    );

                // One-shot auto-ranging: fit MIN/MAX to the observed data, then reset.
                if (SET_BOUNDARIES == Option.True)
                {
                    Target.SafeCopyToHost();
                    float min = (Target as MyMemoryBlock <float>).Host.Min();
                    float max = (Target as MyMemoryBlock <float>).Host.Max();
                    if (min == max)
                    {
                        // Degenerate range: widen it so the bin width stays non-zero.
                        max = min + 1.00f;
                    }
                    MAX_VALUE      = max;
                    MIN_VALUE      = min;
                    SET_BOUNDARIES = Option.False;
                    TriggerReset();
                }
            }
        }
Example #8
0
        protected override void Reset()
        {
            base.Reset();

            // Release any previously allocated buffer first — reallocating on
            // every Reset would otherwise leak device memory.
            if (m_deviceBuffer != null)
            {
                m_deviceBuffer.Dispose();
            }

            // Allocate and zero a fresh buffer sized to the grid.
            m_deviceBuffer = new CudaDeviceVariable <float>(m_Rows * m_cols);
            m_deviceBuffer.Memset(0);
        }
Example #9
0
        protected override void Reset()
        {
            TextureWidth  = Target.MaskCols;
            TextureHeight = Target.MaskCount / TextureWidth;

            // Capacity of the stored eye-movement path: presumably 500 x/y pairs
            // plus one extra slot — confirm against the consuming kernel.
            int n_eye_examples2store = 500;

            // Dispose the previous buffer so repeated resets do not leak device memory.
            if (EyeMovementPathData != null)
            {
                EyeMovementPathData.Dispose();
            }
            EyeMovementPathData = new CudaDeviceVariable <float>(n_eye_examples2store * 2 + 1);
            EyeMovementPathData.Memset(0);
        }
        protected override void Reset()
        {
            // Texture is a single row of BINS bins, each BIN_PIXEL_WIDTH wide
            // and BIN_PIXEL_HEIGHT tall.
            TextureWidth  = BINS * BIN_PIXEL_WIDTH;
            TextureHeight = BIN_PIXEL_HEIGHT;

            // Swap in a freshly zeroed histogram buffer, one int per bin.
            if (m_d_HistogramData != null)
            {
                m_d_HistogramData.Dispose();
            }
            m_d_HistogramData = new CudaDeviceVariable <int>(BINS);
            m_d_HistogramData.Memset(0);
        }
Example #11
0
        protected override void Reset()
        {
            base.Reset();

            isScreenClear = false;

            // Dispose the old history buffer (if any) before allocating a new
            // one — otherwise every Reset leaks device memory.
            if (m_HistoryDeviceBuffer != null)
            {
                m_HistoryDeviceBuffer.Dispose();
            }

            // Allocate and zero the history
            m_HistoryDeviceBuffer = new CudaDeviceVariable <float>(m_Rows * m_Cols);
            m_HistoryDeviceBuffer.Memset(0);

            // Start the textual history with a single empty entry.
            m_History = new List <string>();
            m_History.Add("");
        }
Example #12
0
        public void Run(MatOperation operation, CudaDeviceVariable <float> A, int ACount, int AColumnHint, CudaDeviceVariable <float> B, int BCount, int BColumnHint, CudaDeviceVariable <float> Result, int ResultCount, int ResultColumnHint, float beta = 1.0f)
        {
            // Dispatches the requested operation to cuBLAS.
            // Operands are flat buffers of Count elements laid out with
            // ColumnHint columns, i.e. (Count / ColumnHint) rows.
            // Result is zeroed first; beta is forwarded to cuBLAS as the scaling
            // factor applied to the existing Result contents.
            Result.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(0.0f), 0));

            switch (operation)
            {
            case MatOperation.Multiplication:   // operands must already have compatible dimensions
                // A * vecB: matrix times column vector.
                if (BCount > 1 && ACount > 1 && BColumnHint == 1 && ACount / AColumnHint > 1 && BCount / BColumnHint == AColumnHint)
                {
                    // Transpose because cuBLAS is column-major while these buffers are row-major.
                    MyCublasFactory.Instance.Gemv(Operation.Transpose,
                                                  AColumnHint, ACount / AColumnHint, 1.0f,
                                                  A, AColumnHint,
                                                  B, 1,
                                                  beta, Result, 1);
                }
                // vecA * B: row vector times matrix.
                else if (ACount > 1 && BCount > 1 && ACount / AColumnHint == 1 && BColumnHint > 1 && BCount / BColumnHint == AColumnHint)
                {
                    MyCublasFactory.Instance.Gemv(Operation.NonTranspose,
                                                  BColumnHint, BCount / BColumnHint, 1.0f,
                                                  B, BColumnHint,
                                                  A, 1,
                                                  beta, Result, 1);
                }
                // trans(vecA) * vecB degenerates to a dot product.
                else if (ACount / AColumnHint == 1 && BColumnHint == 1 && ACount > 1 && BCount > 1)
                {
                    Run(MatOperation.DotProd, A, ACount, AColumnHint, B, BCount, BColumnHint, Result, ResultCount, ResultColumnHint, beta);
                }
                // General A * B matrix multiplication.
                else if (ACount != 1 || BCount != 1)
                {
                    MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.NonTranspose,
                                                  ACount / AColumnHint, BColumnHint, AColumnHint, 1.0f,
                                                  A, ACount / AColumnHint,
                                                  B, BCount / BColumnHint,
                                                  beta, Result, ResultColumnHint);
                }
                break;

            case MatOperation.DotProd:
                // Dot product expressed as a one-row Gemv.
                MyCublasFactory.Instance.Gemv(Operation.Transpose,
                                              ACount, 1, 1.0f,
                                              A, ACount,
                                              B, 1,
                                              beta, Result, 1);
                break;

            default:
                MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
                break;
            }
        }
        public void Run(MatOperation operation, CudaDeviceVariable<float> A, int ACount, int AColumnHint, CudaDeviceVariable<float> B, int BCount, int BColumnHint, CudaDeviceVariable<float> Result, int ResultCount, int ResultColumnHint, float beta = 1.0f)
        {
            // Dispatches the requested operation to cuBLAS.
            // Operands are flat buffers of Count elements with ColumnHint columns,
            // i.e. (Count / ColumnHint) rows. Result is zeroed before the call;
            // beta is forwarded to cuBLAS to scale the existing Result contents.
            Result.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(0.0f), 0));

            switch (operation)
            {
                case MatOperation.Multiplication:  // operands must already have compatible dimensions
                    // A * vecB: matrix times column vector.
                    if (BCount > 1 && ACount > 1 && BColumnHint == 1 && ACount / AColumnHint > 1 && BCount / BColumnHint == AColumnHint)
                    {
                        // Transposed because cuBLAS expects column-major storage.
                        MyCublasFactory.Instance.Gemv(Operation.Transpose,
                            AColumnHint, ACount / AColumnHint, 1.0f,
                            A, AColumnHint,
                            B, 1,
                            beta, Result, 1);
                    }
                    // vecA * B: row vector times matrix.
                    else if (ACount > 1 && BCount > 1 && ACount / AColumnHint == 1 && BColumnHint > 1 && BCount / BColumnHint == AColumnHint)
                    {
                        MyCublasFactory.Instance.Gemv(Operation.NonTranspose,
                            BColumnHint, BCount / BColumnHint, 1.0f,
                            B, BColumnHint,
                            A, 1,
                            beta, Result, 1);
                    }
                    // trans(vecA) * vecB is just a dot product.
                    else if (ACount / AColumnHint == 1 && BColumnHint == 1 && ACount > 1 && BCount > 1)
                    {
                        Run(MatOperation.DotProd, A, ACount, AColumnHint, B, BCount, BColumnHint, Result, ResultCount, ResultColumnHint, beta);
                    }
                    // General A * B matrix multiplication.
                    else if (ACount != 1 || BCount != 1)
                    {
                        MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.NonTranspose,
                            ACount / AColumnHint, BColumnHint, AColumnHint, 1.0f,
                            A, ACount / AColumnHint,
                            B, BCount / BColumnHint,
                            beta, Result, ResultColumnHint);
                    }
                    break;
                case MatOperation.DotProd:
                    // Dot product expressed as a one-row Gemv.
                    MyCublasFactory.Instance.Gemv(Operation.Transpose,
                       ACount, 1, 1.0f,
                       A, ACount,
                       B, 1,
                       beta, Result, 1);
                    break;
                default:
                    MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
                    break;
            }
        }
Example #14
0
 internal MinMax FindMinAndMax(CudaDeviceVariable <float> a, int size)
 {
     // Finds the minimum and maximum of the first <size> elements of <a>:
     // repeatedly reduces blocks of BLOCK_DIM2 elements on the GPU until the
     // remainder fits in one pass, then finishes the scan on the host.
     //
     // Changes vs. original: removed leftover debug host copies (minTest,
     // maxText, test), which forced pointless synchronous device-to-host
     // transfers each pass, and disposed the final intermediate buffer,
     // which previously leaked.
     if (size > 0)
     {
         var ptr = a;
         while (size > BLOCK_DIM2)
         {
             var bufferSize = (size / BLOCK_DIM2) + 1;
             using (var minBlock = new CudaDeviceVariable <float>(bufferSize))
                 using (var maxBlock = new CudaDeviceVariable <float>(bufferSize)) {
                     minBlock.Memset(0);
                     maxBlock.Memset(0);
                     _Use(_findMinAndMax, size, k => k.Run(BLOCK_DIM2, ptr.DevicePointer, size, minBlock.DevicePointer, maxBlock.DevicePointer));
                     if (ptr != a)
                     {
                         ptr.Dispose();
                     }
                     // Pack the per-block minima and maxima into one buffer for the next pass.
                     size = bufferSize * 2;
                     ptr  = new CudaDeviceVariable <float>(size);
                     ptr.CopyToDevice(minBlock, 0, 0, bufferSize * sizeof(float));
                     ptr.CopyToDevice(maxBlock, 0, bufferSize * sizeof(float), bufferSize * sizeof(float));
                 }
         }
         var data = new float[size];
         ptr.CopyToHost(data);
         if (ptr != a)
         {
             // Dispose the last intermediate buffer allocated inside the loop.
             ptr.Dispose();
         }
         float min = float.MaxValue, max = float.MinValue;
         for (var i = 0; i < size; i++)
         {
             var val = data[i];
             if (val > max)
             {
                 max = val;
             }
             if (val < min)
             {
                 min = val;
             }
         }
         return(new MinMax(min, max));
     }
     return(MinMax.Empty);
 }
Example #15
0
        protected override void Reset()
        {
            // Dispose the previous string buffer before reallocating — each
            // Reset otherwise leaks its device allocation.
            if (m_StringDeviceBuffer != null)
            {
                m_StringDeviceBuffer.Dispose();
            }
            m_StringDeviceBuffer = new CudaDeviceVariable <float>(1000);
            m_StringDeviceBuffer.Memset(0);

            List <String> actions = Target.GetActionLabels();

            // Rebuild the GPU label strip only when the number of actions grew.
            if (numOfActions < actions.Count)
            {
                if (m_actionLabels != null)
                {
                    m_actionLabels.Dispose();
                }

                m_actionLabels = new CudaDeviceVariable <uint>(actions.Count * LABEL_PIXEL_WIDTH * LABEL_PIXEL_WIDTH);
                m_actionLabels.Memset(0);

                for (int i = 0; i < actions.Count; i++)
                {
                    MyDrawStringHelper.String2Index(actions[i], m_StringDeviceBuffer);
                    MyDrawStringHelper.DrawStringFromGPUMem(m_StringDeviceBuffer, i * LABEL_PIXEL_WIDTH + 5, 8, 0, 0xFFFFFFFF, m_actionLabels.DevicePointer, LABEL_PIXEL_WIDTH * actions.Count, LABEL_PIXEL_WIDTH, 0, actions[i].Length);
                }

                numOfActions = actions.Count;
            }

            // No predictor for the selected abstract action => clear the view.
            MyStochasticReturnPredictor srp = (MyStochasticReturnPredictor)Target.Vis.GetPredictorNo(AbstractActionIndex);

            if (srp == null)
            {
                m_qMatrix     = null;
                TextureWidth  = 0;
                TextureHeight = 0;
            }
            else
            {
                Target.ReadTwoDimensions(ref m_qMatrix, ref m_qMatrixActions, XAxisVariableIndex, YAxisVariableIndex, ApplyInnerScaling, AbstractActionIndex);

                if (MatrixSizeOK())
                {
                    DrawDataToGpu();
                }
            }
        }
        protected override void Reset()
        {
            base.Reset();

            // Scratch buffer for drawing strings. Dispose the old one so that
            // repeated resets do not leak device memory.
            if (m_StringDeviceBuffer != null)
            {
                m_StringDeviceBuffer.Dispose();
            }
            m_StringDeviceBuffer = new CudaDeviceVariable <float>(1000);
            m_StringDeviceBuffer.Memset(0);

            // Load the kernels required by the selected display method.
            switch (DisplayMethod)
            {
            case MyDisplayMethod.CYCLE:
                m_cycleKernel        = MyKernelFactory.Instance.Kernel(@"Observers\PlotObserverCycleKernel", true);
                m_verticalLineKernel = MyKernelFactory.Instance.Kernel(@"Observers\VerticalLineKernel", true);
                break;

            case MyDisplayMethod.SCALE:
                m_scaleFactor = 1;
                // C# zero-initializes new arrays, so the explicit clearing loop
                // the original carried here was dead code and has been removed.
                m_scaleAverage         = new float[Count];
                m_scaleAverageWeight   = 0;
                m_nbValuesSaved        = 0;
                m_scaleKernel          = MyKernelFactory.Instance.Kernel(@"Observers\PlotObserverScaleKernel", true);
                m_scaleDownScaleKernel = MyKernelFactory.Instance.Kernel(@"Observers\PlotObserverScaleDownScaleKernel", true);
                break;

            case MyDisplayMethod.SCROLL:
                m_scrollKernel      = MyKernelFactory.Instance.Kernel(@"Observers\PlotObserverScrollKernel", true);
                m_scrollShiftKernel = MyKernelFactory.Instance.Kernel(@"Observers\PlotObserverScrollShiftKernel", true);
                break;

            default:
                break;
            }

            // Plot area is the texture minus the configured offsets.
            m_plotAreaWidth           = TextureWidth - m_plotAreaOffsetX;
            m_plotAreaHeight          = TextureHeight - m_plotAreaOffsetY;
            m_isDirty                 = true;
            m_currentRealTimeStep     = 0;
            m_currentSamplingTimeStep = 0;
            UpdateColorsToGpu();
            updateHistoryBuffer();
        }
Example #17
0
        protected override void Reset()
        {
            // Rebuild the action-label texture when more actions have appeared,
            // then refresh the Q-matrix view for the selected predictor.
            List <MyMotivatedAction> actionList = Target.Rds.ActionManager.Actions;
            int actionCount = actionList.Count;

            if (numOfActions < actionCount)
            {
                if (m_actionLabels != null)
                {
                    m_actionLabels.Dispose();
                }

                m_actionLabels = new CudaDeviceVariable <uint>(actionCount * LABEL_PIXEL_WIDTH * LABEL_PIXEL_WIDTH);
                m_actionLabels.Memset(0);

                // One label per LABEL_PIXEL_WIDTH-wide slot across the strip.
                int stripWidth = LABEL_PIXEL_WIDTH * actionCount;
                for (int idx = 0; idx < actionCount; idx++)
                {
                    MyDrawStringHelper.DrawString(actionList[idx].GetLabel(), idx * LABEL_PIXEL_WIDTH + 5, 8, 0, 0xFFFFFFFF, m_actionLabels.DevicePointer, stripWidth, LABEL_PIXEL_WIDTH);
                }

                numOfActions = actionCount;
            }

            // Only look the predictor up while the index is inside the variable range.
            MyStochasticReturnPredictor srp =
                (AbstractActionIndex < Target.Rds.VarManager.MAX_VARIABLES)
                    ? (MyStochasticReturnPredictor)Target.Vis.GetPredictorNo(AbstractActionIndex)
                    : null;

            if (srp == null)
            {
                // Nothing to display: clear the matrix and collapse the texture.
                m_qMatrix     = null;
                TextureWidth  = 0;
                TextureHeight = 0;
            }
            else
            {
                Target.Vis.ReadTwoDimensions(ref m_qMatrix, ref m_qMatrixActions, srp, XAxisVariableIndex, YAxisVariableIndex, ShowCurrentMotivations);
                if (MatrixSizeOK())
                {
                    DrawDataToGpu();
                }
            }
        }
Example #18
0
        private void drawCoordinates()
        {
            m_canvas.Memset(COLOR_BACKGROUND);

            // Ordinate labels: pick a round step (half a power of ten covering
            // the value range) and draw one formatted label per step.
            double valueRange = m_plotCurrentValueMax - m_plotCurrentValueMin;
            double magnitude  = Math.Floor(Math.Log10(valueRange));
            double step       = Math.Pow(10, magnitude) / 2;
            int    decimals   = (magnitude >= 1) ? 0 : (1 - (int)magnitude);
            double firstValue = Math.Ceiling(m_plotCurrentValueMin / step) * step;

            for (int n = 0; firstValue + n * step < m_plotCurrentValueMax; n++)
            {
                double value = firstValue + n * step;
                string label = string.Format("{0,8:N" + decimals + "}", value);
                // Vertical pixel position of the label, centered on the tick value.
                double y = TextureHeight - m_plotAreaOffsetY
                           - m_plotAreaHeight * (value - m_plotCurrentValueMin) / valueRange
                           - MyDrawStringHelper.CharacterHeight / 2;
                MyDrawStringHelper.DrawString(label, 0, (int)y, COLOR_BACKGROUND, COLOR_FONT, VBODevicePointer, TextureWidth, TextureHeight);
            }
        }
        protected override void Execute()
        {
            // Draws the (XData, YData) points plus a line parameterized by
            // alpha/beta from Target.Output onto the VBO texture.
            // NOTE(review): the line kernel receives beta as D_K and alpha as D_Q,
            // which suggests y = K*x + Q — confirm against the kernel source.
            float alpha = Target.Output.Host[0];
            float beta  = Target.Output.Host[1];
            // Axis bounds rounded outward to whole units.
            float xmin  = (float)Math.Floor(Target.XData.Host.Min());
            float xmax  = (float)Math.Ceiling(Target.XData.Host.Max());
            float ymin  = (float)Math.Floor(Target.YData.Host.Min());
            float ymax  = (float)Math.Ceiling(Target.YData.Host.Max());

            // Pixels per data unit along each axis.
            float xscale = Size / (xmax - xmin);
            float yscale = Size / (ymax - ymin);
            //float scale = Math.Min(xscale, yscale);

            // Non-owning wrapper around the VBO so it can be memset.
            CudaDeviceVariable <float> VBOvar = new CudaDeviceVariable <float>(VBODevicePointer);

            VBOvar.Memset(0xFFFFFFFF);  //fill white


            // Scatter the data points.
            m_kernel.SetConstantVariable("D_ALPHA", alpha);
            m_kernel.SetConstantVariable("D_BETA", beta);
            //m_kernel.SetConstantVariable("D_SCALE", scale);
            m_kernel.SetConstantVariable("D_XSCALE", xscale);
            m_kernel.SetConstantVariable("D_YSCALE", yscale);
            m_kernel.SetConstantVariable("D_XMIN", xmin);
            m_kernel.SetConstantVariable("D_YMIN", ymin);
            m_kernel.SetConstantVariable("D_SIZE", Size);

            m_kernel.SetupExecution(Target.ValidFields);
            m_kernel.Run(Target.XData, Target.YData, VBODevicePointer, Target.ValidFields);

            // Overlay the fitted line.
            m_lineKernel.SetConstantVariable("D_K", beta);
            m_lineKernel.SetConstantVariable("D_Q", alpha);
            //m_lineKernel.SetConstantVariable("D_SCALE", scale);
            m_lineKernel.SetConstantVariable("D_XSCALE", xscale);
            m_lineKernel.SetConstantVariable("D_YSCALE", yscale);
            m_lineKernel.SetConstantVariable("D_XMIN", xmin);
            m_lineKernel.SetConstantVariable("D_YMIN", ymin);
            m_lineKernel.SetConstantVariable("D_SIZE", Size);

            m_lineKernel.SetupExecution(Size);
            m_lineKernel.Run(VBODevicePointer, Size);
        }
Example #20
0
        protected T[] InternalExecuteCuda <T>(
            byte[] kernelBinary,
            String function,
            int bufferSize,
            ParallelTaskParams loaderParams,
            params Object[] kernelParams) where T : struct
        {
            // Loads a pre-compiled PTX kernel, runs it with a zeroed device
            // result buffer (first kernel argument) followed by the wrapped
            // kernelParams, and returns the result buffer copied back to the
            // host. Checkpoints are triggered around each phase for callers
            // that hook them (e.g. timing/measurement).
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointStart);

            CudaContext context = ContextWithDevice(loaderParams.CudaDevice);

            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformInit);
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelBuild);

            CudaDeviceVariable <T> resultBufferVar = new CudaDeviceVariable <T>(bufferSize);

            resultBufferVar.Memset(0);

            // Pair each kernel argument with the IDisposable that owns it (if
            // any) so the device allocations can be released after the run.
            List <Tuple <Object, IDisposable> > vars = new List <Tuple <Object, IDisposable> >();

            vars.Add(new Tuple <Object, IDisposable>(resultBufferVar.DevicePointer, resultBufferVar));
            vars.AddRange(WrapDeviceVariables(kernelParams, true));
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceWrite);

            CudaKernel kernel = context.LoadKernelPTX(kernelBinary, function);

            kernel.BlockDimensions = new dim3(loaderParams.BlockSize.Width, loaderParams.BlockSize.Height);
            kernel.GridDimensions  = new dim3(loaderParams.GridSize.Width, loaderParams.GridSize.Height);
            kernel.Run(vars.Select(tuple => tuple.Item1).ToArray());
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelExecute);

            // Implicit device-to-host copy via CudaDeviceVariable's conversion operator.
            T[] resultBuffer = resultBufferVar;
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceRead);

            // Dispose every owned device allocation, including the result buffer.
            vars.Where(tuple => tuple.Item2 != null).ToList().ForEach(tuple => tuple.Item2.Dispose());
            TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformDeinit);

            return(resultBuffer);
        }
Example #21
0
        internal IReadOnlyList <CudaDeviceVariable <float> > TensorAddPadding(IReadOnlyList <CudaDeviceVariable <float> > matrixList, int rows, int columns, int padding)
        {
            // Produce one zero-filled (rows + 2*padding) x (columns + 2*padding)
            // buffer per input matrix, then let the kernel copy each input into
            // its padded counterpart.
            int depth         = matrixList.Count;
            int paddedRows    = rows + padding * 2;
            int paddedColumns = columns + padding * 2;
            var padded        = new List <CudaDeviceVariable <float> >();

            for (int d = 0; d < depth; d++)
            {
                var outputBuffer = new CudaDeviceVariable <float>(paddedRows * paddedColumns);
                outputBuffer.Memset(0);
                padded.Add(outputBuffer);
            }

            // The kernel receives two device arrays of pointers: inputs and outputs.
            using (var outputDevicePtr = new CudaDeviceVariable <CUdeviceptr>(depth))
                using (var inputDevicePtr = new CudaDeviceVariable <CUdeviceptr>(depth)) {
                    inputDevicePtr.CopyToDevice(matrixList.Select(m => m.DevicePointer).ToArray());
                    outputDevicePtr.CopyToDevice(padded.Select(m => m.DevicePointer).ToArray());
                    _Use(_tensorAddPadding, paddedRows, paddedColumns, k => k.Run(0, inputDevicePtr.DevicePointer, outputDevicePtr.DevicePointer, rows, columns, paddedRows, paddedColumns, depth, padding));
                }
            return(padded);
        }
        private void updateHistoryBuffer()
        {
            // Nothing to plot.
            if (Count == 0)
            {
                return;
            }

            // Refuse to allocate when more curves are requested than supported.
            if (Count > nbCurvesMax)
            {
                MyLog.ERROR.WriteLine("Number of displayed curved is too high (" + Count + ", max " + nbCurvesMax + ")");
                return;
            }

            // Drop the previous history before allocating a fresh zeroed buffer,
            // sized as one value per plot-area pixel column and per curve.
            if (m_valuesHistory != null)
            {
                m_valuesHistory.Dispose();
            }

            int bufferLength = m_plotAreaWidth * Count;

            m_valuesHistory = new CudaDeviceVariable <float>(bufferLength);
            m_valuesHistory.Memset(0);
        }
Example #23
0
 public void Clear()
 {
     // Zero the entire backing device buffer. Only valid once the object is
     // initialized (checked via IsValid in debug builds).
     Debug.Assert(IsValid);
     _data.Memset(0);
 }
        public void MinimizeCUBLAS(int tileCountX, int tileCountY)
        {
            // Appears to solve a per-tile least-squares system that converts the
            // measured pairwise shifts into per-frame shifts, with iterative
            // outlier rejection. All heavy lifting runs in batched cuBLAS calls,
            // one batch entry per tile. NOTE(review): the exact matrix layout
            // comes from CreateShiftMatrix()/the kernels — confirm against them.
            int shiftCount;// = shifts.Count;

            shiftCount = GetShiftCount();

            // Gather all per-tile shift measurements into one contiguous buffer.
            concatenateShifts.RunSafe(shifts_d, shiftPitches, AllShifts_d, shiftCount, tileCountX, tileCountY);


            shiftsMeasured.CopyToDevice(AllShifts_d);

            CudaStopWatch sw = new CudaStopWatch();

            sw.Start();


            int imageCount = frameCount;
            int tileCount  = tileCountX * tileCountY;
            int n1         = imageCount - 1;    // unknowns per tile (one fewer than frames)
            int m          = shiftCount;        // equations (measurements) per tile

            status.Memset(0);
            shiftMatrices.Memset(0);
            float[] shiftMatrix = CreateShiftMatrix();
            shiftMatrices.CopyToDevice(shiftMatrix, 0, 0, shiftMatrix.Length * sizeof(float));

            // Replicate the design matrix for every tile; keep a pristine copy
            // (shiftSafeMatrices) for the final projection after outliers were
            // zeroed out of the working copy.
            copyShiftMatrixKernel.RunSafe(shiftMatrices, tileCount, imageCount, shiftCount);
            shiftSafeMatrices.CopyToDevice(shiftMatrices);


            // Up to 10 outlier-rejection rounds.
            for (int i = 0; i < 10; i++)
            {
                // Normal equations: square = M^T * M for every tile.
                blas.GemmBatched(Operation.Transpose, Operation.NonTranspose, n1, n1, m, one, shiftMatrixArray, m, shiftMatrixArray, m, zero, matrixSquareArray, n1, tileCount);
                //float[] mSqr = matricesSquared;

                if (n1 <= 32)
                {
                    //MatinvBatchedS can only invert up to 32x32 matrices
                    blas.MatinvBatchedS(n1, matrixSquareArray, n1, matrixInvertedArray, n1, infoInverse, tileCount);
                }
                else
                {
                    // Larger systems: LU factorization followed by explicit inverse.
                    blas.GetrfBatchedS(n1, matrixSquareArray, n1, pivotArray, infoInverse, tileCount);
                    blas.GetriBatchedS(n1, matrixSquareArray, n1, pivotArray, matrixInvertedArray, n1, infoInverse, tileCount);
                }


                //int[] info = infoInverse;
                //mSqr = matricesInverted;
                // Apply the pseudo-inverse to the measurements, then project the
                // per-frame solution back to predicted per-measurement shifts.
                blas.GemmBatched(Operation.NonTranspose, Operation.Transpose, n1, m, n1, one, matrixInvertedArray, n1, shiftMatrixArray, m, zero, solvedMatrixArray, n1, tileCount);
                blas.GemmBatched(Operation.NonTranspose, Operation.Transpose, n1, 2, m, one, solvedMatrixArray, n1, shiftMeasuredArray, 2, zero, shiftOneToOneArray, n1, tileCount);
                blas.GemmBatched(Operation.NonTranspose, Operation.NonTranspose, m, 2, n1, one, shiftMatrixArray, m, shiftOneToOneArray, n1, zero, shiftOptimArray, m, tileCount);

                // Compare measured vs. predicted shifts and flag/zero outliers.
                checkForOutliers.RunSafe(shiftsMeasured, shiftsOptim, shiftMatrices, status, infoInverse, tileCount, imageCount, shiftCount);

                // One status flag per tile; the loop stops when the sum reaches
                // -tileCount, so -1 appears to mean "converged" for a tile.
                status.Sum(statusSum, buffer, 0);
                int[] stats = status;

                for (int j = 0; j < tileCount; j++)
                {
                    if (stats[j] >= 0)
                    {
                        Console.Write(j + ": " + stats[j] + "; ");
                    }
                }
                Console.WriteLine();

                int stat = statusSum;
                if (stat == -tileCount)
                {
                    break;
                }

                //float2[] AllShifts_h = shiftsMeasured;
            }

            // Final projection through the unmodified (outlier-free) design matrix.
            blas.GemmBatched(Operation.NonTranspose, Operation.NonTranspose, m, 2, n1, one, shiftMatrixSafeArray, m, shiftOneToOneArray, n1, zero, shiftMeasuredArray, m, tileCount);

            AllShifts_d.Memset(0);
            transposeShifts.RunSafe(AllShifts_d, shiftsMeasured, shiftsOneToOne, shiftsOneToOne_d, tileCount, imageCount, shiftCount);
            //shiftsMeasured.CopyToDevice(AllShifts_d);

            //float2[] AllShiftsFinal_h = shiftsMeasured;

            sw.Stop();
            Console.WriteLine("Time for optimisation: " + sw.GetElapsedTime() + " msec.");

            // Scatter the optimized shifts back into the per-tile buffers.
            separateShifts.RunSafe(AllShifts_d, shifts_d, shiftPitches, shiftCount, tileCountX, tileCountY);
        }
        protected override void Reset()
        {
            // The texture is a single row of BINS bins, each BIN_PIXEL_WIDTH pixels wide.
            TextureWidth = BIN_PIXEL_WIDTH * BINS;
            TextureHeight = BIN_PIXEL_HEIGHT;

            // Release the previous device buffer (if any) before re-allocating,
            // then create a fresh zero-initialized counter per histogram bin.
            if (m_d_HistogramData != null)
            {
                m_d_HistogramData.Dispose();
            }

            m_d_HistogramData = new CudaDeviceVariable<int>(BINS);
            m_d_HistogramData.Memset(0);
        }
Example #26
0
        /// <summary>
        /// Entry point for the simpleStreams sample. Times a plain device memcpy and a
        /// plain kernel launch, then compares non-streamed execution (kernel + blocking
        /// copy per repetition) against execution overlapped across <c>nstreams</c> CUDA
        /// streams, and finally validates the device results against the expected value.
        /// Command-line options: "help", "use_generic_memory", "sync_method &lt;0|1|2|4&gt;".
        /// </summary>
        static void Main(string[] args)
        {
            int cuda_device = 0;
            int nstreams = 4;               // number of streams for CUDA calls
            int nreps = 10;                 // number of times each experiment is repeated
            int n = 16 * 1024 * 1024;       // number of ints in the data set
            int nbytes = n * sizeof(int);   // number of data bytes
            dim3 threads, blocks;           // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel;   // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it later instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool bPinGenericMemory = false; // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;	// number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            // "help" prints usage and exits immediately with a PASSED status.
            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            // "sync_method N" selects the context-scheduling flag. The accepted numeric
            // values (0=SchedAuto, 1=SchedSpin, 2=SchedYield, 4=BlockingSync) mirror the
            // CUDA driver API CUctx_flags values; 3 is intentionally invalid.
            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int temp = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                            case 0:
                                device_sync_method = CUCtxFlags.SchedAuto;
                                break;
                            case 1:
                                device_sync_method = CUCtxFlags.SchedSpin;
                                break;
                            case 2:
                                device_sync_method = CUCtxFlags.SchedYield;
                                break;
                            case 4:
                                device_sync_method = CUCtxFlags.BlockingSync;
                                break;
                            default:
                                error = true;
                                break;
                        }
                    }
                    if (!error)
                    {
                        // NOTE(review): sDeviceSyncMethod is indexed by the raw flag value,
                        // so it must have entries at indices 0,1,2 and 4 — confirm against
                        // its declaration (not visible here).
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();
            if(0==num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);
            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            // Scale the per-kernel workload to the device generation: CC >= 2.0 gets the
            // full 100 iterations, CC 1.2/1.3 gets 5, CC 1.0/1.1 gets 1.
            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1; // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                    deviceProp.MultiProcessorCount,
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                    ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            // MapHost must be set at context creation time for zero-copy host mapping.
            CudaContext ctx;
            if (bPinGenericMemory)
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            else
                ctx = new CudaContext(cuda_device, device_sync_method);

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "simpleStreams_x64.ptx";
            else
                resName = "simpleStreams.ptx";

            string resNamespace = "simpleStreams";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");

            // allocate host memory
            int c = 5;											// value to which the array will be initialized
            int[] h_a = null;									// pointer to the array data in host memory
            CudaPageLockedHostMemory<int> hAligned_a = null;	// pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)
            //Note: In .net we have two separated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable<int> d_c = c; //using new implicit cast to allocate memory and assign value
            CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();     // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();   // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            // (kernel then blocking copy, serialized nreps times)
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            // Each stream processes a 1/nstreams slice of the array, so the grid shrinks
            // by a factor of nstreams and copies can overlap kernels in other streams.
            threads = new dim3(512, 1);
            blocks = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0); // set device memory to all 0s, for testing correctness

            start_event.Record();
            for(int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for(int i = 0; i < nstreams; i++)
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            // Each repetition adds c*niterations to every element (presumably — depends
            // on the init_array PTX kernel, which is not visible here).
            bool bResults = correct_data(h_a, n, c*nreps*niterations);

            // release resources
            for(int i = 0; i < nstreams; i++) {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Example #27
0
        /// <summary>
        /// Runs the marching-cubes pipeline on the GPU: fills a voxel grid (position
        /// weights + noise texture), computes ambient/normal data, counts triangles per
        /// cell, prefix-sums the counts into vertex offsets, and — if any triangles were
        /// produced — allocates a D3D11 vertex buffer and writes the vertices into it
        /// via CUDA/DirectX interop.
        /// </summary>
        /// <param name="kernelPositionWeight">Kernel that initializes voxel position weights; also owns the noise texture reference.</param>
        /// <param name="width">Voxel grid width.</param>
        /// <param name="height">Voxel grid height.</param>
        /// <param name="depth">Voxel grid depth.</param>
        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count            = width * height * depth;
            // Cells are defined between voxels, so each cell dimension is one less.
            int widthD           = width - 1;
            int heightD          = height - 1;
            int depthD           = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            // 8x8x8 thread blocks; grids are ceil-divided so partial blocks cover the edges.
            dim3 blockDimensions           = new dim3(8, 8, 8);
            dim3 gridDimensions            = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            CUDANoiseCube noiseCube = new CUDANoiseCube();

            // 16^3 noise volume bound as a wrapped, linearly-filtered 3D texture.
            CudaArray3D        noiseArray   = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable <Voxel> voxelsDev = new CudaDeviceVariable <Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            // NOTE(review): sets the private _gridDim field via reflection instead of the
            // GridDimensions property — presumably to bypass validation in the property
            // setter; confirm this is still necessary with the current ManagedCuda version.
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            // Round each cell dimension up to a power of two — presumably a layout
            // requirement of the prefix-scan implementation; TODO confirm.
            int nearestW     = NearestPowerOfTwo(widthD);
            int nearestH     = NearestPowerOfTwo(heightD);
            int nearestD     = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            // Per-cell triangle counts, zeroed before the cases kernel accumulates into them.
            CudaDeviceVariable <int> trisCountDevice = new CudaDeviceVariable <int>(nearestCount);

            trisCountDevice.Memset(0);
            CudaDeviceVariable <int> offsetsDev = new CudaDeviceVariable <int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            // Exclusive scan of the triangle counts gives each cell's output offset.
            CudaDeviceVariable <int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            // Total triangles = last prefix sum (triangles before the last cell)
            // plus the last cell's own count; 3 vertices per triangle.
            int lastTrisCount = 0;

            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;

            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                if (container.Geometry != null)
                {
                    container.Geometry.Dispose();
                }

                container.VertexCount = totalVerticesCount;

                // D3D11 vertex buffer sized for the generated mesh.
                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags      = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags    = ResourceOptionFlags.None,
                    SizeInBytes    = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage          = ResourceUsage.Default
                });

                // Register the D3D buffer with CUDA, map it, write vertices, unmap so
                // D3D can use it again, then release the interop registration.
                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);

                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                // No geometry produced: clear the container's mesh state.
                container.VertexCount = 0;

                if (container.Geometry != null)
                {
                    container.Geometry.Dispose();
                }
            }

            // Release all temporary GPU resources.
            // NOTE(review): these are not in a try/finally, so an exception above leaks
            // device memory — consider wrapping in using/finally.
            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }
Example #28
0
        static void Main(string[] args)
        {
            int   cuda_device = 0;
            int   nstreams = 4;                           // number of streams for CUDA calls
            int   nreps = 10;                             // number of times each experiment is repeated
            int   n = 16 * 1024 * 1024;                   // number of ints in the data set
            int   nbytes = n * sizeof(int);               // number of data bytes
            dim3  threads, blocks;                        // kernel launch configuration
            float elapsed_time, time_memcpy, time_kernel; // timing variables
            float scale_factor = 1.0f;

            // allocate generic memory and pin it laster instead of using cudaHostAlloc()
            // Untested in C#, so stick to cudaHostAlloc().
            bool       bPinGenericMemory  = false;                   // we want this to be the default behavior
            CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync

            int niterations;                                         // number of iterations for the loop inside the kernel

            ShrQATest.shrQAStart(args);

            Console.WriteLine("[ simpleStreams ]");

            foreach (var item in args)
            {
                if (item.Contains("help"))
                {
                    printHelp();
                    ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
                }
            }

            bPinGenericMemory = false;
            foreach (var item in args)
            {
                if (item.Contains("use_generic_memory"))
                {
                    bPinGenericMemory = true;
                }
            }

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i].Contains("sync_method"))
                {
                    int  temp  = -1;
                    bool error = false;
                    if (i < args.Length - 1)
                    {
                        error = int.TryParse(args[i + 1], out temp);
                        switch (temp)
                        {
                        case 0:
                            device_sync_method = CUCtxFlags.SchedAuto;
                            break;

                        case 1:
                            device_sync_method = CUCtxFlags.SchedSpin;
                            break;

                        case 2:
                            device_sync_method = CUCtxFlags.SchedYield;
                            break;

                        case 4:
                            device_sync_method = CUCtxFlags.BlockingSync;
                            break;

                        default:
                            error = true;
                            break;
                        }
                    }
                    if (!error)
                    {
                        Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                        nreps = 100;
                    }
                    else
                    {
                        Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
                    }
                }
            }

            int num_devices = CudaContext.GetDeviceCount();

            if (0 == num_devices)
            {
                Console.Write("your system does not have a CUDA capable device, waiving test...\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
            cuda_device = CudaContext.GetMaxGflopsDeviceId();

            CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);

            if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
            {
                Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
            }

            if (deviceProp.ComputeCapability.Major >= 2)
            {
                niterations = 100;
            }
            else
            {
                if (deviceProp.ComputeCapability.Minor > 1)
                {
                    niterations = 5;
                }
                else
                {
                    niterations = 1;                     // reduced workload for compute capability 1.0 and 1.1
                }
            }

            // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false
            // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case...
            if (bPinGenericMemory)
            {
                Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
                if (deviceProp.CanMapHostMemory == false)
                {
                    Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
                    bPinGenericMemory = false;
                }
            }

            // Anything that is less than 32 Cores will have scaled down workload
            scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f);
            n            = (int)Math.Round((float)n / scale_factor);

            Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
            Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
                          deviceProp.MultiProcessorCount,
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
                          ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);

            Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
            Console.Write("> array_size   = {0}\n\n", n);

            // enable use of blocking sync, to reduce CPU usage
            Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

            CudaContext ctx;

            if (bPinGenericMemory)
            {
                ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
            }
            else
            {
                ctx = new CudaContext(cuda_device, device_sync_method);
            }

            //Load Kernel image from resources
            string resName;

            if (IntPtr.Size == 8)
            {
                resName = "simpleStreams_x64.ptx";
            }
            else
            {
                resName = "simpleStreams.ptx";
            }

            string resNamespace = "simpleStreams";
            string resource     = resNamespace + "." + resName;
            Stream stream       = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);

            if (stream == null)
            {
                throw new ArgumentException("Kernel not found in resources.");
            }

            CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");


            // allocate host memory
            int c = 5;                                                          // value to which the array will be initialized

            int[] h_a = null;                                                   // pointer to the array data in host memory
            CudaPageLockedHostMemory <int> hAligned_a = null;                   // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

            //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a).
            //In C++ hAligned_a would point somewhere inside the h_a array.
            AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

            Console.Write("\nStarting Test\n");

            // allocate device memory
            CudaDeviceVariable <int> d_c = c;            //using new implicit cast to allocate memory and asign value
            CudaDeviceVariable <int> d_a = new CudaDeviceVariable <int>(nbytes / sizeof(int));

            CudaStream[] streams = new CudaStream[nstreams];
            for (int i = 0; i < nstreams; i++)
            {
                streams[i] = new CudaStream();
            }

            // create CUDA event handles
            // use blocking sync
            CudaEvent    start_event, stop_event;
            CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default);

            start_event = new CudaEvent(eventflags);
            stop_event  = new CudaEvent(eventflags);

            // time memcopy from device
            start_event.Record();                 // record in stream-0, to ensure that all previous CUDA calls have completed
            hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
            stop_event.Record();
            stop_event.Synchronize();               // block until the event is actually recorded
            time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

            // time kernel
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            init_array.BlockDimensions = threads;
            init_array.GridDimensions  = blocks;
            init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
            stop_event.Record();
            stop_event.Synchronize();
            time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);


            //////////////////////////////////////////////////////////////////////
            // time non-streamed execution for reference
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)threads.x, 1);
            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
                hAligned_a.SynchronCopyToHost(d_a);
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

            //////////////////////////////////////////////////////////////////////
            // time execution with nstreams streams
            threads = new dim3(512, 1);
            blocks  = new dim3(n / (int)(nstreams * threads.x), 1);
            byte[] memset = new byte[nbytes];             // set host memory bits to all 1s, for testing correctness
            for (int i = 0; i < nbytes; i++)
            {
                memset[i] = 255;
            }
            System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
            d_a.Memset(0);             // set device memory to all 0s, for testing correctness

            start_event.Record();
            for (int k = 0; k < nreps; k++)
            {
                init_array.BlockDimensions = threads;
                init_array.GridDimensions  = blocks;
                // asynchronously launch nstreams kernels, each operating on its own portion of data
                for (int i = 0; i < nstreams; i++)
                {
                    init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);
                }

                // asynchronously launch nstreams memcopies.  Note that memcopy in stream x will only
                //   commence executing when all previous CUDA calls in stream x have completed
                for (int i = 0; i < nstreams; i++)
                {
                    hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
                }
            }
            stop_event.Record();
            stop_event.Synchronize();
            elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
            Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

            // check whether the output is correct
            Console.Write("-------------------------------\n");
            //We can directly access data in hAligned_a using the [] operator, but copying
            //data first to h_a is faster.
            System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));

            bool bResults = correct_data(h_a, n, c * nreps * niterations);

            // release resources
            for (int i = 0; i < nstreams; i++)
            {
                streams[i].Dispose();
            }
            start_event.Dispose();
            stop_event.Dispose();

            hAligned_a.Dispose();
            d_a.Dispose();
            d_c.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();

            Console.ReadKey();
            ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
Example #29
0
 /// <summary>
 /// Zeroes the underlying device buffer (byte-wise memset to 0).
 /// </summary>
 public void Clear()
 {
     _data.Memset(0);
 }
Example #30
0
        /// <summary>
        /// Dispatches a cuBLAS routine for the requested <paramref name="operation"/>:
        /// matrix/vector multiplication (GEMV/GEMM, with operand swapping for cuBLAS's
        /// column-major layout) or a dot product. Operands are interpreted via their
        /// element counts and column hints; the caller must supply consistent dimensions.
        /// </summary>
        /// <param name="operation">Which matrix operation to perform.</param>
        /// <param name="A">Left operand (device memory).</param>
        /// <param name="ACount">Total element count of A.</param>
        /// <param name="AColumnHint">Number of columns of A.</param>
        /// <param name="B">Right operand (device memory).</param>
        /// <param name="BCount">Total element count of B.</param>
        /// <param name="BColumnHint">Number of columns of B.</param>
        /// <param name="Result">Output buffer (device memory); zeroed on entry.</param>
        /// <param name="ResultCount">Total element count of Result.</param>
        /// <param name="ResultColumnHint">Number of columns of Result.</param>
        /// <param name="beta">Scaling factor forwarded to cuBLAS as beta.</param>
        public void Run(MatOperation operation, CudaDeviceVariable <float> A, int ACount, int AColumnHint, CudaDeviceVariable <float> B, int BCount, int BColumnHint, CudaDeviceVariable <float> Result, int ResultCount, int ResultColumnHint, float beta = 1.0f)
        {
            // Zero the output buffer using the bit pattern of 0.0f.
            // NOTE(review): since Result is zeroed here, the beta * C term of GEMV/GEMM
            // acts on an all-zero buffer — confirm that beta is meant to have an effect.
            Result.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(0.0f), 0));

            switch (operation)
            {
            case MatOperation.Multiplication:                                                                                         // vectors/matrices always have to be in the correct dimensions!
                if (BCount > 1 && ACount >= 1 && BColumnHint == 1 && ACount / AColumnHint > 1 && BCount / BColumnHint == AColumnHint) //. A*vecB
                {
                    MyCublasFactory.Instance.Gemv(Operation.Transpose,                                                                // transpose because it does Ax row-wise if x is a row vector
                                                  AColumnHint, ACount / AColumnHint, 1.0f,
                                                  A, AColumnHint,
                                                  B, 1,
                                                  beta, Result, 1);
                }
                else if (ACount >= 1 && BCount > 1 && ACount / AColumnHint == 1 && BColumnHint > 1 && BCount / BColumnHint == AColumnHint) // vecA*B
                {
                    MyCublasFactory.Instance.Gemv(Operation.NonTranspose,                                                                  // non-transposed because it does Ax row-wise if x is a row vector
                                                  BColumnHint, BCount / BColumnHint, 1.0f,
                                                  B, BColumnHint,
                                                  A, 1,
                                                  beta, Result, 1);
                }
                else if (ACount / AColumnHint == 1 && BColumnHint == 1 && ACount > 1 && BCount > 1)     //. trans(vecA) * vecB — reduces to a dot product
                {
                    Run(MatOperation.DotProd, A, ACount, AColumnHint, B, BCount, BColumnHint, Result, ResultCount, ResultColumnHint, beta);
                }
                else if (ACount != 1 || BCount != 1)    // A*B   matrix multiplication
                {
                    // cuBLAS uses Fortran (column-major) matrices, thus the operands have to be swapped as described in: http://peterwittek.com/cublas-matrix-c-style.html
                    int m   = BColumnHint;
                    int n   = ACount / AColumnHint;
                    int k   = AColumnHint;
                    int lda = BColumnHint;
                    int ldb = AColumnHint;
                    int ldc = ResultColumnHint;
                    MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.NonTranspose,
                                                  m, n, k, 1.0f,
                                                  B, lda,
                                                  A, ldb,
                                                  beta, Result, ldc);
                }
                break;

            case MatOperation.DotProd:

                // A dot product requires two equally sized vectors and a scalar output.
                if (ACount != BCount || ResultCount != 1)
                {
                    MyLog.Writer.WriteLine(MyLogLevel.ERROR, callee.Name + ": Inconsistent vector dimensions for MyMatrixCublasOps.");
                    break;
                }

                // Dot product expressed as a 1-row GEMV.
                MyCublasFactory.Instance.Gemv(Operation.Transpose,      // transpose because it does Ax row-wise if x is a row vector
                                              ACount, 1, 1.0f,
                                              A, ACount,
                                              B, 1,
                                              beta, Result, 1);
                break;

            default:
                MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
                break;
            }
        }
        /// <summary>
        /// Dispatches a cuBLAS routine for the requested <paramref name="operation"/>:
        /// matrix/vector products via GEMV/GEMM (operands swapped for cuBLAS's
        /// column-major layout) or a dot product. The caller is responsible for
        /// supplying consistently shaped operands.
        /// </summary>
        public void Run(MatOperation operation, CudaDeviceVariable<float> A, int ACount, int AColumnHint, CudaDeviceVariable<float> B, int BCount, int BColumnHint, CudaDeviceVariable<float> Result, int ResultCount, int ResultColumnHint, float beta = 1.0f)
        {
            // Clear the output with the bit pattern of 0.0f before writing into it.
            uint zeroPattern = BitConverter.ToUInt32(BitConverter.GetBytes(0.0f), 0);
            Result.Memset(zeroPattern);

            switch (operation)
            {
                case MatOperation.Multiplication:
                    // Operands must already be in the correct dimensions.
                    if (BCount > 1 && ACount >= 1 && BColumnHint == 1 && ACount / AColumnHint > 1 && BCount / BColumnHint == AColumnHint)
                    {
                        // A * vecB: GEMV with transpose, since cuBLAS applies Ax row-wise when x is a row vector.
                        MyCublasFactory.Instance.Gemv(Operation.Transpose,
                            AColumnHint, ACount / AColumnHint, 1.0f,
                            A, AColumnHint,
                            B, 1,
                            beta, Result, 1);
                    }
                    else if (ACount >= 1 && BCount > 1 && ACount / AColumnHint == 1 && BColumnHint > 1 && BCount / BColumnHint == AColumnHint)
                    {
                        // vecA * B: GEMV without transpose.
                        MyCublasFactory.Instance.Gemv(Operation.NonTranspose,
                            BColumnHint, BCount / BColumnHint, 1.0f,
                            B, BColumnHint,
                            A, 1,
                            beta, Result, 1);
                    }
                    else if (ACount / AColumnHint == 1 && BColumnHint == 1 && ACount > 1 && BCount > 1)
                    {
                        // trans(vecA) * vecB reduces to a dot product.
                        Run(MatOperation.DotProd, A, ACount, AColumnHint, B, BCount, BColumnHint, Result, ResultCount, ResultColumnHint, beta);
                    }
                    else if (ACount != 1 || BCount != 1)
                    {
                        // General matrix-matrix product. cuBLAS expects column-major (Fortran)
                        // storage, so the operands are swapped as described at
                        // http://peterwittek.com/cublas-matrix-c-style.html
                        int gemmM = BColumnHint;
                        int gemmN = ACount / AColumnHint;
                        int gemmK = AColumnHint;
                        int leadB = BColumnHint;
                        int leadA = AColumnHint;
                        int leadResult = ResultColumnHint;
                        MyCublasFactory.Instance.Gemm(Operation.NonTranspose, Operation.NonTranspose,
                            gemmM, gemmN, gemmK, 1.0f,
                            B, leadB,
                            A, leadA,
                            beta, Result, leadResult);
                    }
                    break;
                case MatOperation.DotProd:

                    // Two equally sized vectors in, a single scalar out.
                    if (ACount != BCount || ResultCount != 1)
                    {
                        MyLog.Writer.WriteLine(MyLogLevel.ERROR, callee.Name + ": Inconsistent vector dimensions for MyMatrixCublasOps.");
                        break;
                    }

                    // Dot product expressed as a 1-row GEMV.
                    MyCublasFactory.Instance.Gemv(Operation.Transpose,
                       ACount, 1, 1.0f,
                       A, ACount,
                       B, 1,
                       beta, Result, 1);
                    break;
                default:
                    MyLog.Writer.WriteLine(MyLogLevel.ERROR, "Trying to run cublas for undefined MatOperation");
                    break;
            }
        }
Example #32
0
        /// <summary>
        /// Runs the marching-cubes voxel pipeline on the GPU and (re)builds the D3D11
        /// vertex buffer for the resulting mesh: weights voxels, computes normals/ambient,
        /// counts triangles per cell, prefix-sums the counts, then emits vertices directly
        /// into the mapped DirectX buffer.
        /// </summary>
        /// <param name="kernelPositionWeight">Kernel that fills the voxel grid with position weights.</param>
        /// <param name="width">Voxel grid width.</param>
        /// <param name="height">Voxel grid height.</param>
        /// <param name="depth">Voxel grid depth.</param>
        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count = width * height * depth;
            // Cell grid is one smaller than the voxel grid in each dimension.
            int widthD = width - 1;
            int heightD = height - 1;
            int depthD = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            // 8x8x8 thread blocks; grids are ceil-divided so partial blocks cover the edges.
            dim3 blockDimensions = new dim3(8, 8, 8);
            dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            // 16^3 noise volume bound as a wrapping, linearly filtered 3D texture.
            CUDANoiseCube noiseCube = new CUDANoiseCube();

            CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            // NOTE(review): grid dimensions are forced in via reflection on the private
            // _gridDim field instead of the public GridDimensions property — presumably to
            // work around an API limitation in this managedCuda version; confirm whether
            // the property can be used instead.
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            // Round the cell grid up to powers of two for the prefix-scan buffers.
            int nearestW = NearestPowerOfTwo(widthD);
            int nearestH = NearestPowerOfTwo(heightD);
            int nearestD = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            // Per-cell triangle counts (zeroed) and per-cell case offsets.
            CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
            trisCountDevice.Memset(0);
            CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            // Exclusive-scan style: total = last count + last prefix sum.
            CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            int lastTrisCount = 0;
            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;
            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            // 3 vertices per triangle.
            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                // Replace the previous geometry buffer with one sized for the new mesh.
                if (container.Geometry != null)
                    container.Geometry.Dispose();

                container.VertexCount = totalVerticesCount;

                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage = ResourceUsage.Default
                });

                // Map the D3D11 buffer into CUDA so the vertex kernel can write it directly.
                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
                
                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                // Empty mesh: drop any stale geometry.
                container.VertexCount = 0;

                if (container.Geometry != null)
                    container.Geometry.Dispose();
            }

            // Release all temporary GPU resources.
            // NOTE(review): these disposals are skipped if an exception is thrown above —
            // consider try/finally or using blocks.
            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }
Example #33
0
        /// <summary>
        /// Resizes this dynamic memory block to <paramref name="newCount"/> elements,
        /// allocating fresh host and device buffers first and only swapping them in when
        /// both allocations succeed.
        /// </summary>
        /// <param name="newCount">New element count.</param>
        /// <param name="copyData">When true, existing host and device data is copied into the new buffers.</param>
        /// <returns>True on success; false if either allocation failed (block is left unchanged apart from Count).</returns>
        /// <exception cref="InvalidOperationException">Thrown when the block is not marked dynamic.</exception>
        public override bool Reallocate(int newCount, bool copyData = true)
        {
            // TODO(HonzaS): Some of the current models need this during Execute().
            // TODO(HonzaS): Research will have to switch to the new model, but there is no reason to forbid it now.

            //// TODO(HonzaS): The simulation should be accessible in a better way.
            //if (!Owner.Owner.SimulationHandler.Simulation.IsStepFinished)
            //    throw new InvalidOperationException("Reallocate called from Execute()");

            // Only blocks explicitly marked dynamic may be resized.
            if (!IsDynamic)
            {
                MyLog.ERROR.WriteLine(
                    "Cannot reallocate a static memory block. Use the DynamicAttribute to mark a memory block as dynamic.");
                throw new InvalidOperationException("Cannot reallocate non-dynamic memory block.");
            }

            MyLog.DEBUG.WriteLine("Reallocating {0} from {1} to {2}", Name, Count, newCount);

            int oldCount = Count;

            // NOTE(review): Count is updated before the allocations below; if one of them
            // fails and we return false, Count no longer matches the actual buffers —
            // confirm callers tolerate this.
            Count = newCount;

            // A block that was never allocated gets its device storage created first.
            if (oldCount == 0)
            {
                AllocateDevice();
            }

            // Make sure that both the host and device have enough memory. Allocate first.
            // If one of the allocations fails, return (moving out of scope will get rid of any allocated memory).

            T[] newHostMemory;
            CudaDeviceVariable <T> newDeviceMemory;

            try
            {
                newHostMemory = new T[newCount];
            }
            catch
            {
                //MyLog.WARNING.WriteLine("Could not reallocate host memory.");
                return(false);
            }

            try
            {
                // Allocate device memory on this block's GPU and zero it.
                newDeviceMemory = new CudaDeviceVariable <T>(
                    MyKernelFactory.Instance.GetContextByGPU(Owner.GPU).AllocateMemory(
                        newCount * ESize));

                newDeviceMemory.Memset(BitConverter.ToUInt32(BitConverter.GetBytes(0), 0));
            }
            catch
            {
                //MyLog.WARNING.WriteLine("Could not reallocate device memory.");
                return(false);
            }

            // Both the host and the device have enough memory for the reallocation.

            if (copyData)
            {
                // Copy the host data.
                Array.Copy(Host, newHostMemory, Math.Min(newCount, oldCount));

                // Copy the device data.
                // NOTE(review): unlike the host copy above, this does not clamp to
                // Math.Min(newCount, oldCount) — verify behavior when shrinking the block.
                newDeviceMemory.CopyToDevice(Device[Owner.GPU]);
            }

            // This will get rid of the original host memory.
            Host = newHostMemory;

            // Explicit dispose so that if there's a reference anywhere, we'll find out.
            MyLog.DEBUG.WriteLine("Disposing device memory in Reallocate()");
            Device[Owner.GPU].Dispose();
            Device[Owner.GPU] = newDeviceMemory;

            return(true);
        }
Example #34
0
        /// <summary>
        /// Decodes the most active character from the Target memory block (argmax over the
        /// supported alphabet) and renders it into the console texture, maintaining a
        /// scrolling, line-wrapped history of <c>m_Rows</c> x <c>m_Cols</c> characters.
        /// </summary>
        protected override void Execute()
        {
            // Clear screen on simulation start
            if (!isScreenClear)
            {
                Clear();
                isScreenClear = true;
            }

            // We are able to represent all characters from ' ' (space) to '~' (tilde) and
            // new-line; the last index encodes '\n'.
            int desiredNum = '~' - ' ' + 2;

            MyMemoryBlock <float> target = (MyMemoryBlock <float>)Target;

            if (target != null)
            {
                // Get the data to the CPU so we can scan it.
                Target.SafeCopyToHost();

                // Allow inputs that are different in size; only clamp if necessary.
                int size = Math.Min(target.Host.Length, desiredNum);

                // Find the index of the maximum activation (argmax over the alphabet).
                // (The previously declared 'maxVal' local was never used and has been removed.)
                int idx = 0;
                for (int i = 1; i < size; ++i)
                {
                    if (target.Host[idx] < target.Host[i])
                    {
                        idx = i;
                    }
                }

                // Reconstruct the character; the last index maps to new-line.
                char newChar = '\n';
                if (idx + 1 != desiredNum)
                {
                    newChar = (char)(' ' + idx);
                }

                bool splitOccured = false;
                // Add the character to the history, splitting the line if it is too long.
                if (newChar == '\n')
                {
                    m_History.Add("");
                }
                else if (m_History[m_History.Count - 1].Length >= m_Cols - 1)
                {
                    // Mark the wrap with a trailing backslash and continue on a new line.
                    m_History[m_History.Count - 1] += "\\";
                    m_History.Add(newChar.ToString());
                    splitOccured = true;
                }
                else
                {
                    m_History[m_History.Count - 1] += newChar;
                }

                if (m_History.Count > m_Rows)
                {
                    // History overflowed the screen: scroll by one line and redraw everything.
                    int rowIdx = 0;
                    // Reset GPU data.
                    m_HistoryDeviceBuffer.Memset(0);

                    m_History.RemoveAt(0);

                    foreach (string s in m_History)
                    {
                        int colIdx = 0;
                        foreach (char c in s)
                        {
                            // Characters are stored as float offsets from ' '.
                            m_HistoryDeviceBuffer[m_Cols * rowIdx + colIdx] = (float)(c - ' ');
                            colIdx += 1;
                        }
                        rowIdx += 1;
                    }

                    Clear();

                    for (rowIdx = 0; rowIdx < m_History.Count; ++rowIdx)
                    {
                        MyDrawStringHelper.DrawStringFromGPUMem(m_HistoryDeviceBuffer, 0, rowIdx * (MyDrawStringHelper.CharacterHeight + 1), 0, 0x999999, VBODevicePointer, TextureWidth, TextureHeight, rowIdx * m_Cols, m_Cols);
                    }
                }

                else
                {
                    // No scroll needed: update only the affected line(s).
                    int    lastRow    = m_History.Count - 1;
                    String lastString = m_History[lastRow];
                    if (lastString.Length > 0)
                    {
                        m_HistoryDeviceBuffer[m_Cols * lastRow + lastString.Length - 1] = (float)(lastString.Last() - ' ');
                    }

                    if (splitOccured)
                    {
                        // Also redraw the previous line, which just received the wrap marker.
                        m_HistoryDeviceBuffer[m_Cols * lastRow - 1] = (float)('\\' - ' ');
                        MyDrawStringHelper.DrawStringFromGPUMem(m_HistoryDeviceBuffer, 0, (lastRow - 1) * (MyDrawStringHelper.CharacterHeight + 1), 0, 0x999999, VBODevicePointer, TextureWidth, TextureHeight, (lastRow - 1) * m_Cols, m_Cols);
                    }
                    MyDrawStringHelper.DrawStringFromGPUMem(m_HistoryDeviceBuffer, 0, lastRow * (MyDrawStringHelper.CharacterHeight + 1), 0, 0x999999, VBODevicePointer, TextureWidth, TextureHeight, lastRow * m_Cols, m_Cols);
                }
            }
        }
Example #35
0
        /// <summary>
        /// Reinitializes the console state: marks the screen for clearing, allocates a
        /// zeroed device buffer for the character history, and starts the host-side
        /// history with a single empty line.
        /// </summary>
        protected override void Reset()
        {
            base.Reset();

            isScreenClear = false;

            // Fresh device-side history buffer: one float per character cell, zeroed.
            m_HistoryDeviceBuffer = new CudaDeviceVariable<float>(m_Rows * m_Cols);
            m_HistoryDeviceBuffer.Memset(0);

            // Host-side history begins with one empty line.
            m_History = new List<string> { "" };
        }