// Computes an Argon2d hash of pbMsg.
		//
		// pbMsg / pbSalt: message (password) and salt; both must be non-null.
		// uParallel:      number of lanes (must be non-zero, as it is used
		//                 as a divisor when computing the segment length).
		// uMem:           memory cost in bytes; divided by NbBlockSize to get
		//                 the block count, which is raised to at least
		//                 2 * NbSyncPoints * lanes and rounded down to a
		//                 multiple of lanes * NbSyncPoints.
		// uIt:            number of passes over the memory.
		// cbOut:          length of the returned hash in bytes.
		// uVersion:       version number that is mixed into the initial hash.
		// pbSecretKey / pbAssocData: optional; null is treated as empty.
		//
		// Returns a new array of cbOut bytes. All intermediate buffers derived
		// from the inputs are wiped before the method exits, including when
		// an exception is thrown.
		private static byte[] Argon2d(byte[] pbMsg, byte[] pbSalt, uint uParallel,
			ulong uMem, ulong uIt, int cbOut, uint uVersion, byte[] pbSecretKey,
			byte[] pbAssocData)
		{
			pbSecretKey = (pbSecretKey ?? MemUtil.EmptyByteArray);
			pbAssocData = (pbAssocData ?? MemUtil.EmptyByteArray);

#if ARGON2_B2ROUND_ARRAYS
			InitB2RoundIndexArrays();
#endif

			Argon2Ctx ctx = new Argon2Ctx();
			ctx.Version = uVersion;

			ctx.Lanes = uParallel;
			ctx.TCost = uIt;
			ctx.MCost = uMem / NbBlockSize;
			ctx.MemoryBlocks = Math.Max(ctx.MCost, 2UL * NbSyncPoints * ctx.Lanes);

			// Round the block count down to a multiple of (lanes * sync points)
			// so that all segments have the same length
			ctx.SegmentLength = ctx.MemoryBlocks / (ctx.Lanes * NbSyncPoints);
			ctx.MemoryBlocks = ctx.SegmentLength * ctx.Lanes * NbSyncPoints;

			ctx.LaneLength = ctx.SegmentLength * NbSyncPoints;

			Debug.Assert(NbBlockSize == (NbBlockSizeInQW *
#if KeePassUAP
				(ulong)Marshal.SizeOf<ulong>()
#else
				(ulong)Marshal.SizeOf(typeof(ulong))
#endif
				));
			ctx.Mem = new ulong[ctx.MemoryBlocks * NbBlockSizeInQW];

			Blake2b h = new Blake2b();

			// Declared outside the try block so that the finally block can
			// wipe them no matter where an exception occurs
			byte[] pbH0 = null;
			byte[] pbBlockHash = null;

			try
			{
				// Initial hash: every value is fed as a 4-byte integer, and
				// each variable-length input is preceded by its length
				Debug.Assert(h.HashSize == (NbPreHashDigestLength * 8));
				byte[] pbBuf = new byte[4];
				MemUtil.UInt32ToBytesEx(uParallel, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				MemUtil.UInt32ToBytesEx((uint)cbOut, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				MemUtil.UInt32ToBytesEx((uint)ctx.MCost, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				MemUtil.UInt32ToBytesEx((uint)uIt, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				MemUtil.UInt32ToBytesEx(uVersion, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				MemUtil.UInt32ToBytesEx(0, pbBuf, 0); // Argon2d type = 0
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				MemUtil.UInt32ToBytesEx((uint)pbMsg.Length, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				h.TransformBlock(pbMsg, 0, pbMsg.Length, pbMsg, 0);
				MemUtil.UInt32ToBytesEx((uint)pbSalt.Length, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				h.TransformBlock(pbSalt, 0, pbSalt.Length, pbSalt, 0);
				MemUtil.UInt32ToBytesEx((uint)pbSecretKey.Length, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				h.TransformBlock(pbSecretKey, 0, pbSecretKey.Length, pbSecretKey, 0);
				MemUtil.UInt32ToBytesEx((uint)pbAssocData.Length, pbBuf, 0);
				h.TransformBlock(pbBuf, 0, pbBuf.Length, pbBuf, 0);
				h.TransformBlock(pbAssocData, 0, pbAssocData.Length, pbAssocData, 0);
				h.TransformFinalBlock(MemUtil.EmptyByteArray, 0, 0);
				pbH0 = h.Hash;
				Debug.Assert(pbH0.Length == 64);

				// The seed buffer holds the initial hash; FillFirstBlocks
				// writes the block and lane indices behind it
				pbBlockHash = new byte[NbPreHashSeedLength];
				Array.Copy(pbH0, pbBlockHash, pbH0.Length);
				MemUtil.ZeroByteArray(pbH0); // Wipe as early as possible

				FillFirstBlocks(ctx, pbBlockHash, h);
				MemUtil.ZeroByteArray(pbBlockHash); // Wipe as early as possible

				FillMemoryBlocks(ctx);

				return FinalHash(ctx, cbOut, h);
			}
			finally
			{
				// Wiping again after the early wipes above is harmless; on
				// the exception path this is the only place where the
				// buffers get cleared
				if(pbH0 != null) MemUtil.ZeroByteArray(pbH0);
				if(pbBlockHash != null) MemUtil.ZeroByteArray(pbBlockHash);

				h.Clear();
				MemUtil.ZeroArray<ulong>(ctx.Mem);
			}
		}
		// Produces the final output: the last block of every lane is XORed
		// together, the result is serialized to bytes and passed through
		// Blake2bLong to obtain cbOut output bytes. The temporary block
		// buffers are wiped before returning.
		private static byte[] FinalHash(Argon2Ctx ctx, int cbOut, Blake2b h)
		{
			// Index (within a lane) of the lane's last block
			ulong uLastInLane = ctx.LaneLength - 1UL;

			// Start with the last block of lane 0...
			ulong[] pqXor = new ulong[NbBlockSizeInQW];
			CopyBlock(pqXor, 0, ctx.Mem, uLastInLane * NbBlockSizeInQW);

			// ... and XOR the last blocks of all other lanes into it
			ulong uLane = 1;
			while(uLane < ctx.Lanes)
			{
				ulong uBlock = uLane * ctx.LaneLength + uLastInLane;
				XorBlock(pqXor, 0, ctx.Mem, uBlock * NbBlockSizeInQW);
				++uLane;
			}

			byte[] pbXor = new byte[NbBlockSize];
			StoreBlock(pbXor, pqXor);

			byte[] pbResult = new byte[cbOut];
			Blake2bLong(pbResult, cbOut, pbXor, (int)NbBlockSize, h);

			MemUtil.ZeroArray<ulong>(pqXor);
			MemUtil.ZeroByteArray(pbXor);

			return pbResult;
		}
		// Maps the 32-bit value uPseudoRand to the position (within a lane)
		// of the block that is to be referenced by the block described by
		// ti (pass, slice and index within the segment). bSameLane specifies
		// whether the referenced block is in the same lane as the block
		// being computed. The result is always less than ctx.LaneLength.
		private static ulong IndexAlpha(Argon2Ctx ctx, Argon2ThreadInfo ti,
			uint uPseudoRand, bool bSameLane)
		{
			bool bFirstPass = (ti.Pass == 0);

			// Size of the area from which a block may be referenced
			ulong uRefAreaSize;
			if(bFirstPass && (ti.Slice == 0))
			{
				// First slice of the first pass: all blocks of this
				// segment before the previous one
				Debug.Assert(ti.Index > 0);
				uRefAreaSize = ti.Index - 1UL;
			}
			else
			{
				// First pass: the slices completed so far;
				// later passes: all segments except the current one
				ulong uBase = (bFirstPass ? (ti.Slice * ctx.SegmentLength) :
					(ctx.LaneLength - ctx.SegmentLength));

				if(bSameLane)
					uRefAreaSize = uBase + ti.Index - 1UL;
				else
					uRefAreaSize = uBase - ((ti.Index == 0UL) ? 1UL : 0UL);
			}
			Debug.Assert(uRefAreaSize <= (ulong)uint.MaxValue);

			// Squaring the pseudo-random value (and keeping the upper 32 bits)
			// before scaling it to the reference area biases the relative
			// position towards the end of the area
			ulong uSquare = ((ulong)uPseudoRand * (ulong)uPseudoRand) >> 32;
			ulong uRelPos = uRefAreaSize - 1UL - ((uRefAreaSize * uSquare) >> 32);

			// In the first pass the area starts at the beginning of the lane;
			// later it starts at the next segment (wrapping around to 0
			// after the last slice)
			ulong uStart;
			if(bFirstPass || ((ti.Slice + 1UL) == NbSyncPoints))
				uStart = 0UL;
			else
				uStart = (ti.Slice + 1UL) * ctx.SegmentLength;
			Debug.Assert(uStart <= (ulong)uint.MaxValue);

			Debug.Assert(ctx.LaneLength <= (ulong)uint.MaxValue);
			ulong uAbsPos = (uStart + uRelPos) % ctx.LaneLength;
			return uAbsPos;
		}
		// Fills the memory: for every pass and every slice, one work item
		// per lane is queued on the thread pool (FillSegmentThr), and the
		// method waits for all of them to finish before starting the
		// next slice.
		// Throws an OutOfMemoryException if a work item cannot be queued.
		private static void FillMemoryBlocks(Argon2Ctx ctx)
		{
			int cLanes = (int)ctx.Lanes;
			Argon2ThreadInfo[] vWork = new Argon2ThreadInfo[cLanes];

			for(ulong uPass = 0; uPass < ctx.TCost; ++uPass)
			{
				for(ulong uSlice = 0; uSlice < NbSyncPoints; ++uSlice)
				{
					// Start the segment computations of all lanes
					for(int iLane = 0; iLane < cLanes; ++iLane)
					{
						Argon2ThreadInfo tiLane = new Argon2ThreadInfo();
						tiLane.Context = ctx;

						tiLane.Pass = uPass;
						tiLane.Lane = (ulong)iLane;
						tiLane.Slice = uSlice;

						bool bQueued = ThreadPool.QueueUserWorkItem(
							FillSegmentThr, tiLane);
						if(!bQueued)
						{
							Debug.Assert(false);
							throw new OutOfMemoryException();
						}

						vWork[iLane] = tiLane;
					}

					// Wait until every lane has completed this slice
					for(int iLane = 0; iLane < cLanes; ++iLane)
					{
						vWork[iLane].Finished.WaitOne();
						vWork[iLane].Release();
					}
				}
			}
		}
		// Computes the first two blocks of every lane. For each of them,
		// the block index (0 or 1) and the lane index are written as 32-bit
		// integers into pbBlockHash at the offsets NbPreHashDigestLength
		// and NbPreHashDigestLength + 4, the buffer is expanded to a full
		// block using Blake2bLong, and the block is loaded into ctx.Mem.
		// The temporary block buffer is wiped afterwards; pbBlockHash is
		// modified, but not wiped by this method.
		private static void FillFirstBlocks(Argon2Ctx ctx, byte[] pbBlockHash,
			Blake2b h)
		{
			byte[] pbTemp = new byte[NbBlockSize];

			for(ulong uLane = 0; uLane < ctx.Lanes; ++uLane)
			{
				// Index of the lane's first block
				ulong uLaneStart = uLane * ctx.LaneLength;

				// The lane index is the same for both blocks
				MemUtil.UInt32ToBytesEx((uint)uLane, pbBlockHash,
					NbPreHashDigestLength + 4);

				for(uint uBlock = 0; uBlock < 2; ++uBlock)
				{
					MemUtil.UInt32ToBytesEx(uBlock, pbBlockHash,
						NbPreHashDigestLength);

					Blake2bLong(pbTemp, (int)NbBlockSize, pbBlockHash,
						NbPreHashSeedLength, h);
					LoadBlock(ctx.Mem, (uLaneStart + uBlock) * NbBlockSizeInQW,
						pbTemp);
				}
			}

			MemUtil.ZeroByteArray(pbTemp);
		}