private List<int> getGpus(int nMax)
{
    CudaDnn<float> cuda = new CudaDnn<float>(0);
    List<int> rgGpu = new List<int>();
    int nDevCount = cuda.GetDeviceCount();

    for (int i = 0; i < nDevCount; i++)
    {
        string strDevInfo = cuda.GetDeviceInfo(i, true);
        string strP2PInfo = cuda.GetDeviceP2PInfo(i);

        // Only keep devices that report peer-to-peer capability.
        if (strP2PInfo.Contains("P2P Capable = YES"))
            rgGpu.Add(i);

        if (rgGpu.Count == nMax)
            break;
    }

    cuda.Dispose();

    return rgGpu;
}
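For reference, a minimal sketch of how getGpus might be called when selecting devices for multi-GPU work; the nMax value of 2 and the console output are illustrative assumptions, not part of the original sample.

// Hypothetical usage: query up to two P2P-capable GPUs and report them.
List<int> rgGpu = getGpus(2); // nMax = 2 is an assumed value.

if (rgGpu.Count == 0)
    Console.WriteLine("No P2P-capable GPUs found.");
else
    Console.WriteLine("Using GPU IDs: " + string.Join(", ", rgGpu));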
static void Main(string[] args)
{
    // Create the output log used.
    Log log = new Log("Test");
    log.OnWriteLine += Log_OnWriteLine;

    // Create the CudaDnn connection used.  NOTE: only one CudaDnn connection is needed
    // per thread, as each instance creates and manages its own low-level kernel state,
    // which includes all memory allocated, etc.  All memory handles allocated should
    // be used with the CudaDnn that allocated the memory.
    CudaDnn<float> cuda = new CudaDnn<float>(0, DEVINIT.CUBLAS | DEVINIT.CURAND);
    log.WriteLine("CudaDnn created.");

    // Run the super simple sample.
    runSuperSimpleSample(cuda, log);

    // Run Blob sample #1.
    runSimpleBlobExample1(cuda, log);

    // Run Blob sample #2.
    runSimpleBlobExample2(cuda, log);

    // Run Blob sample #3.
    runSimpleBlobExample3(cuda, log);

    // Release all GPU memory and other state data used.
    cuda.Dispose();
}
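The run*Sample methods are defined elsewhere in the sample. As a rough illustration, a minimal sketch of what runSuperSimpleSample might contain, assuming it exercises only the CudaDnn memory calls shown later in this section (AllocMemory, set, GetMemoryFloat, FreeMemory); the 100-item size and 3.0 fill value are illustrative assumptions.

// Hypothetical sketch: allocate a small GPU buffer, fill it, read it back and verify.
private static void runSuperSimpleSample(CudaDnn<float> cuda, Log log)
{
    long hMem = 0;

    try
    {
        // Allocate 100 float values on the GPU and set each to 3.0.
        hMem = cuda.AllocMemory(100);
        cuda.set(100, hMem, 3.0);

        // Copy the values back to the CPU and verify them.
        float[] rgData = cuda.GetMemoryFloat(hMem);
        for (int i = 0; i < rgData.Length; i++)
        {
            log.CHECK_EQ(rgData[i], 3.0f, "The value at index " + i.ToString() + " should = 3.0!");
        }

        log.WriteLine("Super simple sample passed!");
    }
    finally
    {
        // Always release the GPU memory used.
        if (hMem != 0)
            cuda.FreeMemory(hMem);
    }
}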
protected virtual void dispose()
{
    if (m_bResetOnCleanUp)
    {
        CudaDnn<float> cuda = new CudaDnn<float>(0, DEVINIT.NONE);
        cuda.ResetDevice();
        cuda.Dispose();
    }
}
protected virtual void dispose()
{
    if (m_bResetOnCleanUp)
    {
        CudaDnn<float> cuda = new CudaDnn<float>(0, DEVINIT.NONE);
        cuda.ResetDevice();
        cuda.Dispose();
    }

    if (m_defaultCulture != null)
        Thread.CurrentThread.CurrentCulture = m_defaultCulture;
}
/// <summary>
/// Main demonstration function.
/// </summary>
/// <param name="args">Specifies the command line arguments (not used).</param>
static void Main(string[] args)
{
    // Get the ONNX file to import.
    string strOnnxModelUrl = "https://github.com/onnx/models/raw/master/vision/classification/alexnet/model/bvlcalexnet-9.onnx";
    string strOnnxFile = downloadFile(strOnnxModelUrl);

    // Create the MyCaffe conversion control.
    MyCaffeConversionControl<float> convert = new MyCaffeConversionControl<float>();
    CudaDnn<float> cuda = new CudaDnn<float>(0);
    Log log = new Log("Onnx Test");

    // Convert an ONNX model file into the MyCaffe model description prototxt and weight protobuf.
    MyCaffeModelData modeldata = convert.ConvertOnnxToMyCaffeFromFile(cuda, log, strOnnxFile);

    // Use the model description prototxt (same format used by CAFFE)...
    string strModelDesc = modeldata.ModelDescription;
    // ...and the weights in binary protobuf format (same format used by CAFFE)...
    byte[] rgWeights = modeldata.Weights;
    // ...along with the solver descriptor of your choice to use the model.

    Console.WriteLine("================================");
    Console.WriteLine("IMPORT: Model imported from *.onnx");
    Console.WriteLine("================================");
    Console.WriteLine(strModelDesc);
    Console.WriteLine("--done--");

    // Convert a MyCaffe model file (and weights) into the equivalent ONNX model file.
    Console.WriteLine("================================");
    Console.WriteLine("EXPORT: Model exported to *.onnx");
    Console.WriteLine("================================");

    string strOnnxOutFile = TestDataPath + "\\bvlc_alexnet.onnx";
    if (File.Exists(strOnnxOutFile))
        File.Delete(strOnnxOutFile);

    // Convert the MyCaffe model file (and weights) back into a new ONNX model file.
    convert.ConvertMyCaffeToOnnxFile(cuda, log, modeldata, strOnnxOutFile);
    Console.WriteLine("Exported model to '" + strOnnxOutFile + "'.");

    // Cleanup.
    cuda.Dispose();
    convert.Dispose();

    Console.WriteLine("Press any key to continue...");
    Console.ReadKey();
}
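The downloadFile helper is defined elsewhere in the sample. A minimal sketch, assuming it simply downloads the URL into the folder named by the sample's TestDataPath property and returns the local path; the WebClient-based implementation is an assumption.

// Hypothetical helper: download the file at strUrl into TestDataPath and return its local path.
// Requires 'using System.IO;' and 'using System.Net;'.
private static string downloadFile(string strUrl)
{
    string strFile = Path.Combine(TestDataPath, Path.GetFileName(strUrl));

    // Only download the file if it does not already exist locally.
    if (!File.Exists(strFile))
    {
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(strUrl, strFile);
        }
    }

    return strFile;
}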
private void m_bwInit_DoWork(object sender, DoWorkEventArgs e)
{
    List<string> rgstrGpu = new List<string>();

    // Setup the GPU menu with all GPUs in the system and
    // select the first GPU as the default for testing.
    CudaDnn<float> cuda = new CudaDnn<float>(0);
    int nDeviceCount = cuda.GetDeviceCount();

    for (int i = 0; i < nDeviceCount; i++)
    {
        string strDevice = cuda.GetDeviceName(i);
        rgstrGpu.Add(strDevice);
    }

    cuda.Dispose();

    e.Result = rgstrGpu;
}
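The device list placed in e.Result is picked up on the UI thread in the BackgroundWorker's RunWorkerCompleted handler. A sketch of what that handler might look like, assuming a WinForms menu named gPUToolStripMenuItem; the menu name and selection logic are assumptions.

// Hypothetical completion handler: populate the GPU menu on the UI thread.
private void m_bwInit_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
{
    List<string> rgstrGpu = e.Result as List<string>;
    if (rgstrGpu == null)
        return;

    // Add one menu item per GPU found by the worker.
    foreach (string strGpu in rgstrGpu)
    {
        gPUToolStripMenuItem.DropDownItems.Add(new ToolStripMenuItem(strGpu));
    }

    // Select the first GPU as the default for testing.
    if (gPUToolStripMenuItem.DropDownItems.Count > 0)
        ((ToolStripMenuItem)gPUToolStripMenuItem.DropDownItems[0]).Checked = true;
}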
protected virtual void dispose()
{
    m_cuda.Dispose();
    m_cuda = null;
}
static void Main(string[] args)
{
    // This memory will reside on the GPU.
    long hGpuMem = 0;

    Console.WriteLine("Creating CudaDnn...");
    CudaDnn<float> cuda = new CudaDnn<float>(0);

    try
    {
        string strDeviceInfo = cuda.GetDeviceName(0);
        Console.WriteLine(strDeviceInfo);

        strDeviceInfo = cuda.GetDeviceP2PInfo(0);
        Console.WriteLine(strDeviceInfo);

        strDeviceInfo = cuda.GetDeviceInfo(0, true);
        Console.WriteLine(strDeviceInfo);

        List<long> rghGpuMem = new List<long>();
        long lOffset = 0;

        // You must first allocate the GPU memory to use.
        // Below we will allocate an array of 1000 float values.
        Console.WriteLine("Allocate 1000 items...");
        hGpuMem = cuda.AllocMemory(1000);
        cuda.set(1000, hGpuMem, 0.0);

        // Carve the allocation into ten 100-item memory pointers,
        // filling each with a distinct value.
        Console.WriteLine("Create memory pointers...");
        for (int i = 0; i < 10; i++)
        {
            long hMem1 = cuda.CreateMemoryPointer(hGpuMem, lOffset, 100);
            cuda.set(100, hMem1, (double)(i + 1));
            rghGpuMem.Add(hMem1);
            lOffset += 100;
        }

        // Read each pointer back and verify its length and contents.
        Console.WriteLine("Test memory...");
        for (int i = 0; i < 10; i++)
        {
            long hMem1 = rghGpuMem[i];
            float[] rgData = cuda.GetMemoryFloat(hMem1);

            if (rgData.Length != 100)
                throw new Exception("The data length should = 100!");

            for (int j = 0; j < 100; j++)
            {
                if (rgData[j] != (float)(i + 1))
                    throw new Exception("The data at index " + j.ToString() + " is not correct!");
            }
        }

        Console.WriteLine("Memory test passed successfully!");
    }
    catch (Exception excpt)
    {
        Console.WriteLine("ERROR: " + excpt.Message);
    }
    finally
    {
        // Clean up and release all GPU memory used.
        if (hGpuMem != 0)
        {
            cuda.FreeMemory(hGpuMem);
            hGpuMem = 0;
        }

        cuda.Dispose();
    }

    Console.WriteLine("Press any key to exit.");
    Console.Read();
}
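Each handle returned by CreateMemoryPointer is a view into the parent allocation rather than a copy, which is why only hGpuMem needs to be freed in the finally block above. A small sketch illustrating the view semantics, meant to run inside the same try block; the 42.0 fill value is an arbitrary assumption.

// Hypothetical follow-on check: writes made through a memory pointer are visible
// through the parent allocation, since the pointer is a view, not a copy.
long hView = cuda.CreateMemoryPointer(hGpuMem, 0, 10); // first 10 items of hGpuMem
cuda.set(10, hView, 42.0);

float[] rgAll = cuda.GetMemoryFloat(hGpuMem);
for (int i = 0; i < 10; i++)
{
    if (rgAll[i] != 42.0f)
        throw new Exception("The view write was not visible in the parent allocation!");
}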
static void Main(string[] args)
{
    // Create the output log used.
    Log log = new Log("Test");

    // Create the CudaDnn connection used.  NOTE: only one CudaDnn connection is needed
    // per thread, as each instance creates and manages its own low-level kernel state,
    // which includes all memory allocated, etc.  All memory handles allocated should
    // be used with the CudaDnn that allocated the memory.
    CudaDnn<float> cuda = new CudaDnn<float>(0, DEVINIT.CUBLAS | DEVINIT.CURAND);

    MemoryDataLayer<float> layer = createMemoryDataLayer(cuda, log);
    List<Datum> rgData = dataSetter();

    Blob<float> blobData = new Blob<float>(cuda, log);
    Blob<float> blobLabel = new Blob<float>(cuda, log);
    BlobCollection<float> colBottom = new BlobCollection<float>();
    BlobCollection<float> colTop = new BlobCollection<float>();

    // Set the top blobs, for MemoryDataLayers only have tops (i.e., no bottoms).
    colTop.Add(blobData);
    colTop.Add(blobLabel);

    layer.Setup(colBottom, colTop);
    layer.AddDatumVector(rgData);

    // Run Pass 1 - the memory data layer advances its internal index by the batch size
    // after each forward pass completes.
    layer.Forward(colBottom, colTop);

    float[] rgDataPass1 = colTop[0].mutable_cpu_data;
    float[] rgLabelPass1 = colTop[1].mutable_cpu_data;

    log.CHECK_EQ(rgDataPass1.Length, 60, "There should be 60 data items.");
    for (int i = 0; i < rgDataPass1.Length; i++)
    {
        log.CHECK_EQ(rgDataPass1[i], 10, "The data value should = 10.");
    }

    log.CHECK_EQ(rgLabelPass1.Length, 1, "There should be only one label, for the batch size = 1.");
    log.CHECK_EQ(rgLabelPass1[0], 0, "The label of the first item should = 0.");
    Console.WriteLine("First Pass - label = " + rgLabelPass1[0].ToString());

    // Run Pass 2 - again the internal index advances by the batch size after the
    // forward pass completes, so the next label is expected.
    layer.Forward(colBottom, colTop);

    float[] rgDataPass2 = colTop[0].mutable_cpu_data;
    float[] rgLabelPass2 = colTop[1].mutable_cpu_data;

    log.CHECK_EQ(rgDataPass2.Length, 60, "There should be 60 data items.");
    for (int i = 0; i < rgDataPass2.Length; i++)
    {
        log.CHECK_EQ(rgDataPass2[i], 10, "The data value should = 10.");
    }

    log.CHECK_EQ(rgLabelPass2.Length, 1, "There should be only one label, for the batch size = 1.");
    log.CHECK_EQ(rgLabelPass2[0], 1, "The label of the first item should = 1.");
    Console.WriteLine("Second Pass - label = " + rgLabelPass2[0].ToString());

    // Run Pass 3 - and once more the internal index advances by the batch size.
    layer.Forward(colBottom, colTop);

    float[] rgDataPass3 = colTop[0].mutable_cpu_data;
    float[] rgLabelPass3 = colTop[1].mutable_cpu_data;

    log.CHECK_EQ(rgDataPass3.Length, 60, "There should be 60 data items.");
    for (int i = 0; i < rgDataPass3.Length; i++)
    {
        log.CHECK_EQ(rgDataPass3[i], 10, "The data value should = 10.");
    }

    log.CHECK_EQ(rgLabelPass3.Length, 1, "There should be only one label, for the batch size = 1.");
    log.CHECK_EQ(rgLabelPass3[0], 2, "The label of the first item should = 2.");
    Console.WriteLine("Third Pass - label = " + rgLabelPass3[0].ToString());

    // Release all layer and GPU resources used.
    layer.Dispose();
    blobData.Dispose();
    blobLabel.Dispose();
    cuda.Dispose();

    Console.WriteLine("Press any key...");
    Console.ReadKey();
}
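createMemoryDataLayer and dataSetter are helpers defined elsewhere in the sample. A sketch of what createMemoryDataLayer could look like, assuming a batch size of 1 and a 1x6x10 data shape (60 items per forward pass, matching the checks above); the shape split and the direct MemoryDataLayer constructor are assumptions.

// Hypothetical helper: create a MemoryDataLayer producing 1x1x6x10 data tops.
private static MemoryDataLayer<float> createMemoryDataLayer(CudaDnn<float> cuda, Log log)
{
    LayerParameter p = new LayerParameter(LayerParameter.LayerType.MEMORYDATA);
    p.memory_data_param.batch_size = 1; // one item per forward pass
    p.memory_data_param.channels = 1;   // assumed shape: 1 x 6 x 10 = 60 items
    p.memory_data_param.height = 6;
    p.memory_data_param.width = 10;

    return new MemoryDataLayer<float>(cuda, log, p);
}

dataSetter would correspondingly return a list of Datum items, each holding sixty values of 10 with labels 0, 1, 2, ..., so that the three forward passes above see labels 0, 1, and 2 in turn.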
private void Worker_DoWork(object sender, ActionStateArgs<T> e)
{
    SolverInfo<T> info = e.Arg as SolverInfo<T>;
    NCCL<T> nccl = null;

    m_cuda = new common.CudaDnn<T>(e.DeviceID, DEVINIT.CUBLAS | DEVINIT.CURAND, null, info.CudaPath);

    try
    {
        Solver<T> rank0 = info.Rank0;
        Log log = new Log("Worker solver for DeviceID = " + e.DeviceID.ToString());

        //-----------------------------------------
        //  Transfer the NCCL handle from the
        //  main kernel that created it to the
        //  one used by the CudaDnn on this thread.
        //
        //  After the copy, this thread will 'own'
        //  the nccl and be responsible for its
        //  destruction.
        //-----------------------------------------
        long hNccl = m_cuda.KernelCopyNccl(info.KernelHandle, info.NcclHandle);

        // Create the solver and install the callbacks.
        SolverParameter param = rank0.parameter.Clone();
        param.device_id = e.DeviceID;
        param.type = rank0.parameter.type;

        Solver<T> solver = Solver<T>.Create(m_cuda, log, param, rank0.CancelEvent, null, null, rank0.Database, null, rank0.solver_count, info.SolverRank);
        info.StartedEvent.Set();
        log.CHECK_EQ((int)solver.type, (int)rank0.type, "The solver types should be the same.");

        //-----------------------------------------
        //  Turn off logging for all other
        //  operations on the worker thread.
        //-----------------------------------------
        log.Enable = false;

        nccl = new NCCL<T>(m_cuda, log, solver, e.DeviceID, hNccl, info.GradientReadyEvents);
        info.InitializedEvent.Set();
        m_cuda.SynchronizeDevice();

        // Wait until all workers are created, or until a cancel fires.
        List<WaitHandle> rgWait = new List<WaitHandle>();
        rgWait.AddRange(rank0.CancelEvent.Handles);
        rgWait.Add(info.AllCreatedEvent);

        int nWait = WaitHandle.WaitAny(rgWait.ToArray());
        if (nWait < rgWait.Count - 1)
            return;

        nccl.Broadcast();

        int nIterations = param.max_iter - solver.iter;
        if (info.IterationOverride > 0)
            nIterations = info.IterationOverride;

        solver.Step(nIterations);
        solver.Dispose();
    }
    catch (Exception excpt)
    {
        info.Error = excpt;
        info.ErrorEvent.Set();
    }
    finally
    {
        if (nccl != null)
            nccl.Dispose();

        m_cuda.Dispose();
        m_cuda = null;
    }
}