/// <summary>
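        /// Copies a range from an array in emulated device memory into a host array.
        /// </summary>
        /// <typeparam name="T">Type argument forwarded to the underlying copy routine.</typeparam>
        /// <param name="devArray">The device array to read from.</param>
        /// <param name="devOffset">Offset into the device array; it is added to the offset stored in the array's device memory record.</param>
        /// <param name="hostArray">The host array that receives the data.</param>
        /// <param name="hostOffset">Offset into the host array at which writing starts.</param>
        /// <param name="count">The count passed to the underlying copy routine.</param>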
        protected override void DoCopyFromDevice <T>(Array devArray, int devOffset, Array hostArray, int hostOffset, int count)
        {
            // Resolve the device array handle to the record describing its backing storage.
            var deviceMemory = (EmuDevicePtrEx)GetDeviceMemory(devArray);

            // The caller's offset is relative to the handle, so the record's own
            // offset into the backing array has to be added to it.
            var backingStore = deviceMemory.DevPtr;
            int sourceIndex = deviceMemory.Offset + devOffset;

            DoCopy <T>(backingStore, sourceIndex, hostArray, hostOffset, count);
        }
        /// <summary>
        /// Copies between preallocated arrays on device.
        /// </summary>
        /// <typeparam name="T">Blittable type.</typeparam>
        /// <param name="srcDevArray">The source device array.</param>
        /// <param name="dstDevArray">The destination device array.</param>
        public override void CopyOnDevice <T>(T[] srcDevArray, T[] dstDevArray)
        {
            var source      = (EmuDevicePtrEx)GetDeviceMemory(srcDevArray);
            var destination = (EmuDevicePtrEx)GetDeviceMemory(dstDevArray);

            // Copy only as many elements as both regions can hold.
            int elementCount = Math.Min(source.TotalSize, destination.TotalSize);

            Array.Copy(source.DevPtr, source.Offset, destination.DevPtr, destination.Offset, elementCount);
        }
        /// <summary>
        /// Copies a range of elements from an array registered with this device to an array
        /// registered with a peer device.
        /// </summary>
        /// <typeparam name="T">Type argument of the copy; not used by this implementation.</typeparam>
        /// <param name="srcDevArray">The source device array (looked up on this device).</param>
        /// <param name="srcOffset">Offset into the source array, added to the offset of its device memory record.</param>
        /// <param name="peer">The device on which the destination array is looked up.</param>
        /// <param name="dstDevArray">The destination device array (looked up on <paramref name="peer"/>).</param>
        /// <param name="dstOffet">Offset into the destination array, added to the offset of its device memory record.</param>
        /// <param name="count">Number of elements to copy.</param>
        protected override void DoCopyDeviceToDevice <T>(Array srcDevArray, int srcOffset, GPGPU peer, Array dstDevArray, int dstOffet, int count)
        {
            var source = (EmuDevicePtrEx)GetDeviceMemory(srcDevArray);
            // The destination record is owned by the peer, not by this instance.
            var destination = (EmuDevicePtrEx)peer.GetDeviceMemory(dstDevArray);

            var sourceStore = source.DevPtr;
            int sourceIndex = source.Offset + srcOffset;
            var destinationStore = destination.DevPtr;
            int destinationIndex = destination.Offset + dstOffet;

            Array.Copy(sourceStore, sourceIndex, destinationStore, destinationIndex, count);
        }
        /// <summary>
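        /// Clears a range of elements of an array in emulated device memory.
        /// </summary>
        /// <typeparam name="T">Type argument of the operation; not used by this implementation.</typeparam>
        /// <param name="devArray">The device array to clear.</param>
        /// <param name="offset">Offset of the first element to clear, relative to the start of the device array.</param>
        /// <param name="count">Number of elements to clear; 0 (the default) derives the count from the array's total size.</param>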
        protected override void DoSet <T>(Array devArray, int offset = 0, int count = 0)
        {
            VerifyOnGPU(devArray);
            EmuDevicePtrEx ptrEx = (EmuDevicePtrEx)GetDeviceMemory(devArray);

            if (count == 0)
            {
                // A count of 0 means "up to the end of the array". The elements that
                // remain after 'offset' are TotalSize - offset; using TotalSize itself
                // would run 'offset' elements past the end of this array's region.
                count = ptrEx.TotalSize - offset;
            }

            // ptrEx.Offset locates this array inside the backing store; 'offset' is
            // relative to the array itself.
            Array.Clear(ptrEx.DevPtr, ptrEx.Offset + offset, count);
        }
        /// <summary>
        /// Registers a new three-dimensional handle that refers to the backing storage of an
        /// existing device array.
        /// </summary>
        /// <typeparam name="T">Element type of the existing array.</typeparam>
        /// <typeparam name="U">Requested element type; must be the same type as <typeparamref name="T"/>.</typeparam>
        /// <param name="offset">Offset stored in the new device memory record.</param>
        /// <param name="devArray">The existing device array.</param>
        /// <param name="x">The x dimension of the new record.</param>
        /// <param name="y">The y dimension of the new record.</param>
        /// <param name="z">The z dimension of the new record.</param>
        /// <returns>A zero-length array that serves as the handle of the new record.</returns>
        /// <exception cref="CudafyHostException">The two type arguments differ.</exception>
        protected override Array DoCast <T, U>(int offset, Array devArray, int x, int y, int z)
        {
            bool sameElementType = typeof(T) == typeof(U);
            if (!sameElementType)
            {
                throw new CudafyHostException(CudafyHostException.csX_NOT_SUPPORTED, "Casting between types on Emulator");
            }

            // The new record shares the backing array of the existing one.
            var existing = (EmuDevicePtrEx)GetDeviceMemory(devArray);
            var recast   = new EmuDevicePtrEx(offset, existing.DevPtr, x, y, z);

            // An empty array is enough: it is only used as the key of the new record.
            T[, ,] handle = new T[0, 0, 0];
            AddToDeviceMemory(handle, recast);
            return handle;
        }
        //        protected override void DoLaunch(dim3 gridSize, dim3 blockSize, int streamId, KernelMethodInfo gpuMethodInfo, params object[] arguments)
        //        {
        //            if (streamId > -1 && !_streams.ContainsKey(streamId))
        //                _streams.Add(streamId, streamId);


        //            MethodInfo mi = gpuMethodInfo.Method;
        //            if (mi == null)
        //                throw new CudafyHostException(CudafyHostException.csX_NOT_SET, gpuMethodInfo.Name);
        //            bool isStatic = mi.IsStatic;
        //            object instance = isStatic ? null : Activator.CreateInstance(mi.DeclaringType);
        //            if (gpuMethodInfo.IsDummy)
        //            {
        //                mi.Invoke(instance, arguments);
        //                return;
        //            }
        //            //List<Type> paramTypes = new List<Type>();
        //            //mi.Parameters().ToList().ForEach(p => paramTypes.Add(p.ParameterType));
        //            MethodInvoker imi = null;
        //            //mi.DeclaringType.DelegateForCallMethod(mi.Name,
        //                //typeof(GThread), typeof(byte[]), typeof(long), typeof(uint[]));   //mi.DelegateForCallMethod();
        //            StaticMethodInvoker smi = null;
        //            if(isStatic)
        //                smi = mi.DelegateForCallStaticMethod();
        //            else
        //                imi = mi.DelegateForCallMethod();

        //            GGrid grid = new GGrid(gridSize);
        //            for (int x = 0; x < gridSize.x; x++)
        //            {
        //                for (int y = 0; y < gridSize.y; y++)
        //                {
        //                    int totalSize = blockSize.x * blockSize.y * blockSize.z;
        //                    Thread[] threads = new Thread[totalSize];
        //                    IAsyncResult[] ars = new IAsyncResult[totalSize];
        //                    GBlock blk2lnch = new GBlock(grid, blockSize, x, y);
        //                    int tCtr = 0;
        //                    for (int tx = 0; tx < blockSize.x; tx++)
        //                    {
        //                        for (int ty = 0; ty < blockSize.y; ty++)
        //                        {
        //                            GThread ht = new GThread(tx, ty, blk2lnch);
        //                            object[] pList = BuildParameterList(mi, ht, arguments);

        //#warning OPTIMIZATION if there is no synchronize then start and join threads in multiple of processor count - check this in disassembly and put flag in gpuMethodInfo
        //                            //threads[tCtr] = new Thread(() =>
        //                            //{
        //                            IAsyncResult ar = null;
        //                            if(isStatic)
        //                                ar = smi.BeginInvoke(pList, null, null);
        //                            else
        //                                ar = imi.BeginInvoke(instance, pList, null, null);
        //                                //if (mi.IsStatic)
        //                                //    mi.Call(pList);
        //                                //else
        //                                //    mi.Call(instance, pList);
        //                           // });

        //                            //mi.Call(instance, pList);
        //                            //threads[tCtr].Name = string.Format("Grid_{0}_{1}_Thread_{2}_{3}", x, y, tx, ty);
        //                            //threads[tCtr].Start();
        //                            //if (ctr % 16 == 0)
        //                            //    Console.WriteLine("Ctr=" + ctr.ToString());
        //                            //ctr++;
        //                            ars[tCtr] = ar;
        //                            tCtr++;
        //                        }
        //                    }

        //                    for (int i = 0; i < totalSize; i++)
        //                    {
        //                        //threads[i].Join();
        //                        //Console.WriteLine("Thread {0} exited.", threads[i].Name);
        //                        if (isStatic)
        //                            smi.EndInvoke(ars[i]);
        //                        else
        //                            imi.EndInvoke(ars[i]);
        //                    }
        //                }
        //            }
        //        }

        /// <summary>
        /// Translates the caller's launch arguments into the argument list used to invoke
        /// a kernel method on the emulator.
        /// </summary>
        /// <param name="mi">The kernel method whose parameters drive the translation.</param>
        /// <param name="userArgs">The arguments supplied by the caller; GThread instances among them are skipped.</param>
        /// <param name="dic">Receives every temporary array created for an argument that could not be passed
        /// directly, mapped to the device memory record it was filled from.</param>
        /// <returns>The translated arguments; each GThread parameter is represented by a placeholder instance.</returns>
        /// <exception cref="CudafyHostException">A parameter that receives a user argument is declared out or by reference.</exception>
        private object[] BuildParameterList2(MethodInfo mi, object[] userArgs, out Dictionary <Array, EmuDevicePtrEx> dic)
        {
            dic = new Dictionary<Array, EmuDevicePtrEx>();

            var             translated  = new List<object>();
            ParameterInfo[] parameters  = mi.GetParameters();
            int             nextUserArg = 0;

            for (int p = 0; p < parameters.Length; p++)
            {
                ParameterInfo parameter     = parameters[p];
                Type          parameterType = parameter.ParameterType;

                // GThread parameters get a placeholder; it does not consume a user argument.
                if (parameterType == typeof(GThread))
                {
                    translated.Add(new GThread(0, 0, null));
                    continue;
                }

                // No user arguments left: nothing is added for this parameter.
                if (nextUserArg >= userArgs.Length)
                {
                    continue;
                }

                object userArg = userArgs[nextUserArg];
                nextUserArg++;

                // A GThread supplied by the caller is dropped; step back so the same
                // parameter is retried with the next user argument.
                if (userArg is GThread)
                {
                    p--;
                    continue;
                }

                // An array argument for a non-array, by-value parameter: pass one element.
                if (!parameterType.IsArray && userArg.GetType().IsArray && !parameter.IsOut && !parameterType.IsByRef)
                {
                    var scalarSource = (EmuDevicePtrEx)GetDeviceMemory(userArg);
                    // NOTE(review): always reads index 0 of the backing array, regardless of scalarSource.Offset.
                    translated.Add(scalarSource.DevPtr.GetValue(0));
                    continue;
                }

                if (parameter.IsOut)
                {
                    throw new CudafyHostException(CudafyHostException.csPARAMETER_PASSED_BY_REFERENCE_X_NOT_CURRENTLY_SUPPORTED, "out");
                }

                if (parameterType.IsByRef)
                {
                    throw new CudafyHostException(CudafyHostException.csPARAMETER_PASSED_BY_REFERENCE_X_NOT_CURRENTLY_SUPPORTED, "ref");
                }

                // Anything that is not an array is handed over unchanged.
                if (!userArg.GetType().IsArray)
                {
                    translated.Add(userArg);
                    continue;
                }

                var deviceMemory = (EmuDevicePtrEx)GetDeviceMemory(userArg);

                // The backing array can be passed as-is when it starts at offset 0 and
                // already has the rank the parameter expects.
                if (deviceMemory.Offset == 0 && deviceMemory.DevPtr.Rank == parameterType.GetArrayRank())
                {
                    translated.Add(deviceMemory.DevPtr);
                    continue;
                }

                // Otherwise build a temporary array of the record's dimensions, fill it from
                // the backing array and remember the pairing in 'dic'.
                Type  elementType = parameterType.GetElementType();
                Array staging     = Array.CreateInstance(elementType, deviceMemory.GetDimensions());
                DoCopy(deviceMemory.DevPtr, deviceMemory.Offset, staging, 0, deviceMemory.TotalSize, elementType);
                translated.Add(staging);
                dic.Add(staging, deviceMemory);
            }

            return translated.ToArray();
        }
        /// <summary>
        /// Launches a kernel method on the emulator, running one managed thread per emulated GPU thread.
        /// </summary>
        /// <param name="gridSize">Size of the grid.</param>
        /// <param name="blockSize">Size of the block.</param>
        /// <param name="streamId">Stream id, or -1 for non-async.</param>
        /// <param name="gpuMethodInfo">The gpu method info.</param>
        /// <param name="arguments">The arguments.</param>
        protected override void DoLaunch(dim3 gridSize, dim3 blockSize, int streamId, KernelMethodInfo gpuMethodInfo, params object[] arguments)
        {
            // Record a stream id the first time it is used.
            if (streamId > -1 && !_streams.ContainsKey(streamId))
            {
                _streams.Add(streamId, streamId);
            }

            MethodInfo mi = gpuMethodInfo.Method;

            if (mi == null)
            {
                throw new CudafyHostException(CudafyHostException.csX_NOT_SET, gpuMethodInfo.Name);
            }

            // Instance methods are invoked on a fresh instance of their declaring type.
            object instance = mi.IsStatic ? null : Activator.CreateInstance(mi.DeclaringType);

            if (gpuMethodInfo.IsDummy)
            {
                // Dummy methods are invoked once, directly, without emulated threads.
                mi.Invoke(instance, BuildDummyArgumentList(arguments));
                return;
            }

            // Reject unsupported launch shapes before any argument staging is done, so
            // no temporary arrays are allocated and filled for a launch that cannot run.
            if (gridSize.z > 1)
            {
                throw new CudafyHostException(CudafyHostException.csX_NOT_SUPPORTED, "3D grid sizes");
            }
            if (blockSize.z > 1)
            {
                throw new CudafyHostException(CudafyHostException.csX_NOT_SUPPORTED, "3D block sizes");
            }

            GGrid grid = new GGrid(gridSize);
            Dictionary<Array, EmuDevicePtrEx> dic;
            object[] pList = BuildParameterList2(mi, arguments, out dic);

            // Blocks run one after another; the threads of a block run concurrently.
            for (int x = 0; x < gridSize.x; x++)
            {
                for (int y = 0; y < gridSize.y; y++)
                {
                    GBlock blk2lnch = new GBlock(grid, blockSize, x, y);
                    RunEmulatedBlock(mi, instance, pList, blk2lnch, blockSize, x, y);
                }
            }

            // Copy every temporary array created by BuildParameterList2 back into the
            // device memory region it was filled from. The temporaries were created with
            // the element type of their parameter, so that type is read from the array.
            foreach (object argument in pList)
            {
                Array staged = argument as Array;
                EmuDevicePtrEx ptrEx;
                if (staged != null && dic.TryGetValue(staged, out ptrEx))
                {
                    DoCopy(staged, 0, ptrEx.DevPtr, ptrEx.Offset, ptrEx.TotalSize, staged.GetType().GetElementType());
                }
            }
        }

        /// <summary>
        /// Builds the argument list for a dummy method: arrays known to the emulator are replaced by
        /// their backing arrays, everything else (including null) is passed through unchanged.
        /// </summary>
        /// <param name="arguments">The arguments supplied by the caller.</param>
        /// <returns>A new array holding the translated arguments.</returns>
        /// <exception cref="CudafyHostException">A device array with a non-zero offset was passed.</exception>
        private object[] BuildDummyArgumentList(object[] arguments)
        {
            object[] argsCopy = new object[arguments.Length];
            for (int i = 0; i < arguments.Length; i++)
            {
                object argument = arguments[i];

                // The null check keeps a null argument from failing on GetType().
                EmuDevicePtrEx deviceMemory = null;
                if (argument != null && argument.GetType().IsArray)
                {
                    deviceMemory = TryGetDeviceMemory(argument) as EmuDevicePtrEx;
                }

                if (deviceMemory == null)
                {
                    argsCopy[i] = argument;
                }
                else if (deviceMemory.Offset == 0)
                {
                    argsCopy[i] = deviceMemory.DevPtr;
                }
                else
                {
                    throw new CudafyHostException(CudafyHostException.csX_NOT_CURRENTLY_SUPPORTED, "Offsets in arrays passed to dummy functions");
                }
            }
            return argsCopy;
        }

        /// <summary>
        /// Runs one emulated block: starts one managed thread per (x, y) thread position of the block
        /// and waits until all of them have finished.
        /// </summary>
        /// <param name="mi">The kernel method to invoke.</param>
        /// <param name="instance">The instance to invoke the method on, or null for a static method.</param>
        /// <param name="pList">The argument template; GThread entries are replaced per thread.</param>
        /// <param name="block">The block the threads belong to.</param>
        /// <param name="blockSize">The block dimensions; only x and y are used.</param>
        /// <param name="blockX">The x index of the block in the grid (used for thread names).</param>
        /// <param name="blockY">The y index of the block in the grid (used for thread names).</param>
        private static void RunEmulatedBlock(MethodInfo mi, object instance, object[] pList, GBlock block, dim3 blockSize, int blockX, int blockY)
        {
            // Only the threads that were actually started are tracked, so the join below
            // can never touch an unused slot (the z dimension is not iterated).
            var started = new List<Thread>();

            for (int tx = 0; tx < blockSize.x; tx++)
            {
                for (int ty = 0; ty < blockSize.y; ty++)
                {
                    GThread ht = new GThread(tx, ty, block);

                    // Each thread gets its own argument array in which every GThread
                    // placeholder is replaced by that thread's GThread.
                    object[] pListCopy = (object[])pList.Clone();
                    for (int pc = 0; pc < pListCopy.Length; pc++)
                    {
                        if (pListCopy[pc] is GThread)
                        {
                            pListCopy[pc] = ht;
                        }
                    }

#warning OPTIMIZATION if there is no synchronize then start and join threads in multiple of processor count - check this in disassembly and put flag in gpuMethodInfo
                    Thread worker = new Thread(() =>
                    {
                        mi.Invoke(instance, pListCopy);
                    });

                    worker.Name = string.Format("Grid_{0}_{1}_Thread_{2}_{3}", blockX, blockY, tx, ty);
                    worker.Start();
                    started.Add(worker);
                }
            }

            // The block is finished once every one of its threads has exited.
            foreach (Thread running in started)
            {
                running.Join();
            }
        }