/// <summary>
/// Copies <paramref name="totalElements"/> elements from <paramref name="src"/> into
/// <paramref name="result"/>, where both tensors are backed by <c>CudaStorage</c>.
/// </summary>
/// <remarks>
/// All driver calls issued directly by this method use <c>CUstream.NullStream</c>;
/// the code assumes the default stream is in use on both devices.
/// When <c>CanMemcpy</c> reports that a flat copy is possible, a single
/// <c>cuMemcpyAsync</c> is issued. Otherwise the copy is delegated to
/// <c>CopyGpuConvertTypes</c> (element types differ), <c>CopyGpuDirect</c>
/// (peer access is available), or <c>CopyGpuIndirect</c> (fallback).
/// </remarks>
/// <param name="result">Destination tensor; its storage is cast to <c>CudaStorage</c>.</param>
/// <param name="src">Source tensor; its storage is cast to <c>CudaStorage</c>.</param>
/// <param name="totalElements">Number of elements to copy.</param>
/// <exception cref="CudaException">
/// Thrown when <c>cuStreamWaitEvent</c> or <c>cuMemcpyAsync</c> returns a code other
/// than <c>CUResult.Success</c>.
/// </exception>
public void CopyGpu(Tensor result, Tensor src, long totalElements)
{
    // We assume here that we are using the default stream for both devices.
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);

    CudaStorage resultStorage = (CudaStorage)result.Storage;
    CudaContext resultContext = context.CudaContextForTensor(result);
    CUdeviceptr resultPtr = resultStorage.DevicePtrAtElement(result.StorageOffset);

    CudaStorage srcStorage = (CudaStorage)src.Storage;
    CudaContext srcContext = context.CudaContextForTensor(src);
    CUdeviceptr srcPtr = srcStorage.DevicePtrAtElement(src.StorageOffset);

    if (CudaHelpers.GetDeviceId(result) != CudaHelpers.GetDeviceId(src))
    {
        // Cross-device copy: order the source stream after the destination stream.
        // The helper leaves srcContext current, matching the same-device branch.
        WaitForDestinationReady(resultContext, srcContext);
    }
    else
    {
        srcContext.SetCurrent();
    }

    bool canMemcpy = CanMemcpy(result, src, totalElements);

    if (canMemcpy)
    {
        CUResult res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyAsync(
            resultPtr, srcPtr, totalElements * src.ElementType.Size(), CUstream.NullStream);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }
    }
    else
    {
        if (result.ElementType != src.ElementType)
        {
            CopyGpuConvertTypes(result, src, totalElements);
        }
        else if (context.CanAccessPeer(CudaHelpers.GetDeviceId(src), CudaHelpers.GetDeviceId(result)))
        {
            CopyGpuDirect(result, src, srcContext);
        }
        else
        {
            CopyGpuIndirect(result, src, totalElements);
        }
    }
}

/// <summary>
/// Records an event under <paramref name="resultContext"/> and makes the null stream
/// under <paramref name="srcContext"/> wait on it. On return (or throw after the
/// context switch) <paramref name="srcContext"/> is the current context.
/// </summary>
/// <remarks>
/// NOTE(review): the original comment called this a "two-way barrier", but only the
/// source-waits-for-destination direction is performed here; confirm whether a
/// matching destination-waits-for-source step is expected after the copy.
/// </remarks>
/// <exception cref="CudaException">
/// Thrown when <c>cuStreamWaitEvent</c> does not return <c>CUResult.Success</c>.
/// </exception>
private static void WaitForDestinationReady(CudaContext resultContext, CudaContext srcContext)
{
    resultContext.SetCurrent();
    CudaEvent dstReady = new CudaEvent(CUEventFlags.DisableTiming);
    try
    {
        dstReady.Record();

        srcContext.SetCurrent();
        CUResult res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(
            CUstream.NullStream, dstReady.Event, 0);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }
    }
    finally
    {
        // Always release the native event handle, including when Record() or the
        // wait call fails; previously the event leaked on the exception path.
        dstReady.Dispose();
    }
}