/// <summary>
/// Build the machine code of a "is cpuid available" probe and wrap it into
/// <see cref="NativeMethodHandle{TDelegate}"/>.
/// </summary>
/// <remarks>
/// The probe saves the flags register, flips bit 21 (mask 200000h, the ID flag), writes the flags back,
/// reads them again and returns whether the bit actually changed. The original flags are restored before returning.
/// </remarks>
/// <returns>Method handle of the cpuid support checking function.</returns>
/// <exception cref="PlatformNotSupportedException">Thrown when processor architecture is neither x64 nor x86.</exception>
public static NativeMethodHandle<IsCpuIdSupportedDelegate> CreateIsCpuIdSupportedMethodHandle()
{
    if (!IsSupportedArchitecture())
    {
        ThrowPlatformNotSupportedException("cpuid support checking function is not supported on this architecture");
    }

    byte[] machineCode;
    if (Environment.Is64BitProcess)
    {
        // bool __stdcall isCpuIdSupported(void);
        // The return value is produced in rax, so the native return type may be a 64bit value such as long long int.
        machineCode = new byte[]
        {
            0x9c,                                // pushfq               ; keep a copy of the original flags
            0x9c,                                // pushfq
            0x58,                                // pop rax
            0x48, 0x89, 0xc1,                    // mov rcx,rax
            0x48, 0x35, 0x00, 0x00, 0x20, 0x00,  // xor rax,200000h      ; toggle bit 21
            0x50,                                // push rax
            0x9d,                                // popfq
            0x9c,                                // pushfq
            0x58,                                // pop rax
            0x48, 0x31, 0xc8,                    // xor rax,rcx          ; which bits changed?
            0x48, 0xc1, 0xe8, 0x15,              // shr rax,21
            0x9d,                                // popfq                ; restore the original flags
            0xc3                                 // ret
        };
    }
    else
    {
        // bool __stdcall isCpuIdSupported(void);
        // The return value is produced in eax, so the native return type may be a 32bit value such as int.
        machineCode = new byte[]
        {
            0x9c,                          // pushf                ; keep a copy of the original flags
            0x9c,                          // pushf
            0x58,                          // pop eax
            0x89, 0xc1,                    // mov ecx,eax
            0x35, 0x00, 0x00, 0x20, 0x00,  // xor eax,200000h      ; toggle bit 21
            0x50,                          // push eax
            0x9d,                          // popf
            0x9c,                          // pushf
            0x58,                          // pop eax
            0x31, 0xc8,                    // xor eax,ecx          ; which bits changed?
            0xc1, 0xe8, 0x15,              // shr eax,21
            0x9d,                          // popf                 ; restore the original flags
            0xc3                           // ret
        };
    }

    return NativeMethodHandle.Create<IsCpuIdSupportedDelegate>(machineCode);
}
/// <summary>
/// Build the machine code of a cpuid wrapper and wrap it into <see cref="NativeMethodHandle{TDelegate}"/>.
/// </summary>
/// <remarks>
/// Native signature: <c>void __stdcall cpuid(int* cpuInfo, int eax, int ecx)</c>.
/// After executing cpuid, the native code stores eax, ebx, ecx and edx (in this order)
/// into the four consecutive 32bit slots starting at <c>cpuInfo</c>.
/// </remarks>
/// <typeparam name="TDelegate">Delegate for native method of cpuid.</typeparam>
/// <returns>Method handle of cpuid.</returns>
/// <exception cref="PlatformNotSupportedException">Thrown when processor architecture is neither x64 nor x86.</exception>
private static NativeMethodHandle<TDelegate> CreateCpuIdMethodHandle<TDelegate>()
    where TDelegate : Delegate
{
    if (!IsSupportedArchitecture())
    {
        ThrowPlatformNotSupportedException("cpuid is not supported on this architecture");
    }

    byte[] machineCode;
    if (Environment.Is64BitProcess)
    {
        // void __stdcall cpuid(int* cpuInfo, int eax, int ecx);
        // Arguments are read from rcx, rdx and r8.
        machineCode = new byte[]
        {
            0x53,                    // push rbx                        ; cpuid overwrites ebx
            0x49, 0x89, 0xc9,        // mov r9,rcx                      ; cpuid overwrites ecx, keep cpuInfo in r9
            0x89, 0xd0,              // mov eax,edx
            0x44, 0x89, 0xc1,        // mov ecx,r8d
            0x0f, 0xa2,              // cpuid
            0x41, 0x89, 0x01,        // mov dword ptr [r9],eax
            0x41, 0x89, 0x59, 0x04,  // mov dword ptr [r9 + 04h],ebx
            0x41, 0x89, 0x49, 0x08,  // mov dword ptr [r9 + 08h],ecx
            0x41, 0x89, 0x51, 0x0c,  // mov dword ptr [r9 + 0ch],edx
            0x5b,                    // pop rbx
            0xc3                     // ret
        };
    }
    else
    {
        // void __stdcall cpuid(int* cpuInfo, int eax, int ecx);
        // Arguments are read from the stack; the callee pops them (ret 0Ch).
        machineCode = new byte[]
        {
            0x56,                    // push esi
            0x53,                    // push ebx                        ; cpuid overwrites ebx
            0x8b, 0x74, 0x24, 0x0c,  // mov esi,dword ptr [esp + 0Ch]   ; cpuInfo
            0x8b, 0x44, 0x24, 0x10,  // mov eax,dword ptr [esp + 10h]
            0x8b, 0x4c, 0x24, 0x14,  // mov ecx,dword ptr [esp + 14h]
            0x0f, 0xa2,              // cpuid
            0x89, 0x06,              // mov dword ptr [esi],eax
            0x89, 0x5e, 0x04,        // mov dword ptr [esi + 04h],ebx
            0x89, 0x4e, 0x08,        // mov dword ptr [esi + 08h],ecx
            0x89, 0x56, 0x0c,        // mov dword ptr [esi + 0Ch],edx
            0x5b,                    // pop ebx
            0x5e,                    // pop esi
            0xc2, 0x0c, 0x00         // ret 0Ch
        };
    }

    return NativeMethodHandle.Create<TDelegate>(machineCode);
}
/// <summary>
/// Create native method handle of a memory compare function implemented with AVX2 instructions.
/// </summary>
/// <remarks>
/// <para>
/// The native code takes two pointers and a byte count. It compares 32 bytes per iteration
/// (vpcmpeqb / vpmovmskb) for the part of the size that is a multiple of 32, then compares
/// the remaining bytes one by one. It returns 1 when all bytes are equal and 0 otherwise.
/// </para>
/// <para>
/// This method itself does not verify that the processor supports AVX2.
/// </para>
/// </remarks>
/// <returns>Created native method handle.</returns>
private static NativeMethodHandle<CompareMemoryDelegate> CreateCompareMemoryAvx2MethodHandle()
{
    byte[] machineCode;
    if (Environment.Is64BitProcess)
    {
        // Arguments are read from rcx (first buffer), rdx (second buffer) and r8 (size).
        // NOTE(review): the 32-byte loop index is kept in r10d and sign-extended (movsxd),
        // so sizes that do not fit in a positive 32bit integer look unsupported - confirm against callers.
        machineCode = new byte[]
        {
            0x4d, 0x89, 0xc1,                                // mov r9,r8
            0x49, 0x83, 0xe1, 0xe0,                          // and r9,0xffffffffffffffe0
            0x74, 0x43,                                      // je L4
            0x45, 0x31, 0xd2,                                // xor r10d,r10d
            0x31, 0xc0,                                      // xor eax,eax
            0xeb, 0x0c,                                      // jmp L2
            // L1:
            0x41, 0x83, 0xc2, 0x20,                          // add r10d,0x20
            0x49, 0x63, 0xc2,                                // movsxd rax,r10d
            0x4c, 0x39, 0xc8,                                // cmp rax,r9
            0x73, 0x2d,                                      // jae L3
            // L2:
            0xc5, 0xfa, 0x6f, 0x14, 0x02,                    // vmovdqu xmm2,XMMWORD PTR [rdx+rax*1]
            0xc5, 0xfa, 0x6f, 0x1c, 0x01,                    // vmovdqu xmm3,XMMWORD PTR [rcx+rax*1]
            0xc4, 0xe3, 0x6d, 0x38, 0x44, 0x02, 0x10, 0x01,  // vinserti128 ymm0,ymm2,XMMWORD PTR [rdx+rax*1+0x10],0x1
            0xc4, 0xe3, 0x65, 0x38, 0x4c, 0x01, 0x10, 0x01,  // vinserti128 ymm1,ymm3,XMMWORD PTR [rcx+rax*1+0x10],0x1
            0xc5, 0xfd, 0x74, 0xc1,                          // vpcmpeqb ymm0,ymm0,ymm1
            0xc5, 0xfd, 0xd7, 0xc0,                          // vpmovmskb eax,ymm0
            0x83, 0xf8, 0xff,                                // cmp eax,0xffffffff
            0x74, 0xcd,                                      // je L1
            0x31, 0xc0,                                      // xor eax,eax          ; mismatch -> return 0
            0xc5, 0xf8, 0x77,                                // vzeroupper
            0xc3,                                            // ret
            // L3:
            0xc5, 0xf8, 0x77,                                // vzeroupper
            // L4:
            0x49, 0x63, 0xc1,                                // movsxd rax,r9d
            0x49, 0x39, 0xc0,                                // cmp r8,rax
            0x77, 0x0f,                                      // ja L7
            // L5:
            0xb8, 0x01, 0x00, 0x00, 0x00,                    // mov eax,0x1          ; all bytes equal -> return 1
            0xc3,                                            // ret
            // L6:
            0x48, 0x83, 0xc0, 0x01,                          // add rax,0x1
            0x49, 0x39, 0xc0,                                // cmp r8,rax
            0x76, 0xf1,                                      // jbe L5
            // L7:
            0x44, 0x0f, 0xb6, 0x1c, 0x02,                    // movzx r11d,BYTE PTR [rdx+rax*1]
            0x44, 0x38, 0x1c, 0x01,                          // cmp BYTE PTR [rcx+rax*1],r11b
            0x74, 0xec,                                      // je L6
            0x31, 0xc0,                                      // xor eax,eax          ; mismatch -> return 0
            0xc3                                             // ret
        };
    }
    else
    {
        // Arguments are read from [ebp+0x8] (first buffer), [ebp+0xc] (second buffer) and [ebp+0x10] (size).
        // NOTE(review): every exit uses a plain ret, so the caller has to clean up the stack -
        // presumably the delegate is declared as cdecl; verify against CompareMemoryDelegate.
        machineCode = new byte[]
        {
            0x55,                                            // push ebp
            0x89, 0xe5,                                      // mov ebp,esp
            0x57,                                            // push edi
            0x56,                                            // push esi
            0x8b, 0x75, 0x10,                                // mov esi,DWORD PTR [ebp+0x10]
            0x53,                                            // push ebx
            0x8b, 0x4d, 0x08,                                // mov ecx,DWORD PTR [ebp+0x8]
            0x89, 0xf7,                                      // mov edi,esi
            0x8b, 0x5d, 0x0c,                                // mov ebx,DWORD PTR [ebp+0xc]
            0x83, 0xe7, 0xe0,                                // and edi,0xffffffe0
            0x74, 0x3f,                                      // je L4
            0x31, 0xd2,                                      // xor edx,edx
            0xeb, 0x07,                                      // jmp L2
            // L1:
            0x83, 0xc2, 0x20,                                // add edx,0x20
            0x39, 0xd7,                                      // cmp edi,edx
            0x76, 0x31,                                      // jbe L3
            // L2:
            0xc5, 0xfa, 0x6f, 0x14, 0x13,                    // vmovdqu xmm2,XMMWORD PTR [ebx+edx*1]
            0xc5, 0xfa, 0x6f, 0x1c, 0x11,                    // vmovdqu xmm3,XMMWORD PTR [ecx+edx*1]
            0xc4, 0xe3, 0x6d, 0x38, 0x44, 0x13, 0x10, 0x01,  // vinserti128 ymm0,ymm2,XMMWORD PTR [ebx+edx*1+0x10],0x1
            0xc4, 0xe3, 0x65, 0x38, 0x4c, 0x11, 0x10, 0x01,  // vinserti128 ymm1,ymm3,XMMWORD PTR [ecx+edx*1+0x10],0x1
            0xc5, 0xfd, 0x74, 0xc1,                          // vpcmpeqb ymm0,ymm0,ymm1
            0xc5, 0xfd, 0xd7, 0xc0,                          // vpmovmskb eax,ymm0
            0x83, 0xf8, 0xff,                                // cmp eax,0xffffffff
            0x74, 0xd2,                                      // je L1
            0x31, 0xc0,                                      // xor eax,eax          ; mismatch -> return 0
            0xc5, 0xf8, 0x77,                                // vzeroupper
            0x5b,                                            // pop ebx
            0x5e,                                            // pop esi
            0x5f,                                            // pop edi
            0x5d,                                            // pop ebp
            0xc3,                                            // ret
            // L3:
            0xc5, 0xf8, 0x77,                                // vzeroupper
            // L4:
            0x89, 0xf8,                                      // mov eax,edi
            0x39, 0xfe,                                      // cmp esi,edi
            0x77, 0x11,                                      // ja L7
            // L5:
            0x5b,                                            // pop ebx
            0xb8, 0x01, 0x00, 0x00, 0x00,                    // mov eax,0x1          ; all bytes equal -> return 1
            0x5e,                                            // pop esi
            0x5f,                                            // pop edi
            0x5d,                                            // pop ebp
            0xc3,                                            // ret
            // L6:
            0x83, 0xc0, 0x01,                                // add eax,0x1
            0x39, 0xc6,                                      // cmp esi,eax
            0x76, 0xef,                                      // jbe L5
            // L7:
            0x0f, 0xb6, 0x14, 0x03,                          // movzx edx,BYTE PTR [ebx+eax*1]
            0x38, 0x14, 0x01,                                // cmp BYTE PTR [ecx+eax*1],dl
            0x74, 0xf0,                                      // je L6
            0x5b,                                            // pop ebx
            0x31, 0xc0,                                      // xor eax,eax          ; mismatch -> return 0
            0x5e,                                            // pop esi
            0x5f,                                            // pop edi
            0x5d,                                            // pop ebp
            0xc3                                             // ret
        };
    }

    return NativeMethodHandle.Create<CompareMemoryDelegate>(machineCode);
}
/// <summary>
/// Create native code method for the "inner product" routine operating on <see cref="float"/> arrays.
/// </summary>
/// <remarks>
/// As the reference C++ source inside the method shows, the native code writes the element-wise
/// product <c>dst[i] = src1[i] * src2[i]</c> for every index below <c>size</c>; it does not sum the products.
/// Four elements are processed per iteration with SSE (movups / mulps) and the remaining
/// <c>size % 4</c> elements are processed one at a time.
/// </remarks>
/// <returns>Native code method of inner product calculation</returns>
private NativeMethodHandle<InnerProductDelegate> CreateInnerProductMethodHandle()
{
    // Reference implementation the machine code below was generated from:
    //
    // void __stdcall
    // inner_product(
    //     float* dst,
    //     const float* src1,
    //     const float* src2,
    //     int size) noexcept
    // {
    //     constexpr auto stride = static_cast<int>(sizeof(__m128) / sizeof(float));
    //
    //     for (int i = 0, im = size - stride; i <= im; i += stride) {
    //         _mm_storeu_ps(
    //             &dst[i],
    //             _mm_mul_ps(
    //                 _mm_loadu_ps(&src1[i]),
    //                 _mm_loadu_ps(&src2[i])));
    //     }
    //
    //     const auto remSize = size % stride;
    //     if (remSize == 0) {
    //         return;
    //     }
    //
    //     const auto offset = size - remSize;
    //     for (int i = offset; i < size; i++) {
    //         dst[i] = src1[i] * src2[i];
    //     }
    // }
    byte[] machineCode;
    if (Environment.Is64BitProcess)
    {
        // Arguments are read from rcx (dst), rdx (src1), r8 (src2) and r9d (size).
        machineCode = new byte[]
        {
            0x41, 0x83, 0xf9, 0x03,                                      // cmp r9d,0x3
            0x7e, 0x33,                                                  // jle L2
            0x45, 0x8d, 0x51, 0xfc,                                      // lea r10d,[r9-0x4]
            0x31, 0xc0,                                                  // xor eax,eax
            0x41, 0xc1, 0xea, 0x02,                                      // shr r10d,0x2
            0x49, 0x83, 0xc2, 0x01,                                      // add r10,0x1
            0x49, 0xc1, 0xe2, 0x04,                                      // shl r10,0x4
            0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,              // nop DWORD PTR [rax+rax*1+0x0]
            // L1: vectorized loop, four floats per iteration
            0x41, 0x0f, 0x10, 0x04, 0x00,                                // movups xmm0,XMMWORD PTR [r8+rax*1]
            0x0f, 0x10, 0x0c, 0x02,                                      // movups xmm1,XMMWORD PTR [rdx+rax*1]
            0x0f, 0x59, 0xc1,                                            // mulps xmm0,xmm1
            0x0f, 0x11, 0x04, 0x01,                                      // movups XMMWORD PTR [rcx+rax*1],xmm0
            0x48, 0x83, 0xc0, 0x10,                                      // add rax,0x10
            0x49, 0x39, 0xc2,                                            // cmp r10,rax
            0x75, 0xe7,                                                  // jne L1
            // L2: compute remSize = size % 4 (signed)
            0x45, 0x89, 0xca,                                            // mov r10d,r9d
            0x41, 0xc1, 0xfa, 0x1f,                                      // sar r10d,0x1f
            0x41, 0xc1, 0xea, 0x1e,                                      // shr r10d,0x1e
            0x43, 0x8d, 0x04, 0x11,                                      // lea eax,[r9+r10*1]
            0x83, 0xe0, 0x03,                                            // and eax,0x3
            0x44, 0x29, 0xd0,                                            // sub eax,r10d
            0x74, 0x49,                                                  // je L4
            0x45, 0x89, 0xca,                                            // mov r10d,r9d
            0x41, 0x29, 0xc2,                                            // sub r10d,eax
            0x45, 0x39, 0xd1,                                            // cmp r9d,r10d
            0x7e, 0x3e,                                                  // jle L4
            0x41, 0x83, 0xe9, 0x01,                                      // sub r9d,0x1
            0x4d, 0x63, 0xda,                                            // movsxd r11,r10d
            0x45, 0x29, 0xd1,                                            // sub r9d,r10d
            0x4a, 0x8d, 0x04, 0x9d, 0x00, 0x00, 0x00, 0x00,              // lea rax,[r11*4+0x0]
            0x4f, 0x8d, 0x4c, 0x0b, 0x01,                                // lea r9,[r11+r9*1+0x1]
            0x49, 0xc1, 0xe1, 0x02,                                      // shl r9,0x2
            0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00,  // nop WORD PTR cs:[rax+rax*1+0x0]
            // L3: scalar loop for the remaining elements
            0xf3, 0x0f, 0x10, 0x04, 0x02,                                // movss xmm0,DWORD PTR [rdx+rax*1]
            0xf3, 0x41, 0x0f, 0x59, 0x04, 0x00,                          // mulss xmm0,DWORD PTR [r8+rax*1]
            0xf3, 0x0f, 0x11, 0x04, 0x01,                                // movss DWORD PTR [rcx+rax*1],xmm0
            0x48, 0x83, 0xc0, 0x04,                                      // add rax,0x4
            0x4c, 0x39, 0xc8,                                            // cmp rax,r9
            0x75, 0xe7,                                                  // jne L3
            // L4:
            0xc3                                                         // ret
        };
    }
    else
    {
        // Arguments are read from [ebp+0x8] (dst), [ebp+0xc] (src1), [ebp+0x10] (src2) and [ebp+0x14] (size);
        // the callee pops them (ret 0x10).
        machineCode = new byte[]
        {
            0x55,                    // push ebp
            0x89, 0xe5,              // mov ebp,esp
            0x57,                    // push edi
            0x56,                    // push esi
            0x8b, 0x75, 0x14,        // mov esi,DWORD PTR [ebp+0x14]
            0x53,                    // push ebx
            0x8b, 0x55, 0x08,        // mov edx,DWORD PTR [ebp+0x8]
            0x8b, 0x5d, 0x0c,        // mov ebx,DWORD PTR [ebp+0xc]
            0x83, 0xe4, 0xf0,        // and esp,0xfffffff0
            0x8b, 0x4d, 0x10,        // mov ecx,DWORD PTR [ebp+0x10]
            0x83, 0xfe, 0x03,        // cmp esi,0x3
            0x7e, 0x21,              // jle L2
            0x8d, 0x7e, 0xfc,        // lea edi,[esi-0x4]
            0x31, 0xc0,              // xor eax,eax
            0x83, 0xe7, 0xfc,        // and edi,0xfffffffc
            0x83, 0xc7, 0x04,        // add edi,0x4
            // L1: vectorized loop, four floats per iteration
            0x0f, 0x10, 0x04, 0x81,  // movups xmm0,XMMWORD PTR [ecx+eax*4]
            0x0f, 0x10, 0x0c, 0x83,  // movups xmm1,XMMWORD PTR [ebx+eax*4]
            0x0f, 0x59, 0xc1,        // mulps xmm0,xmm1
            0x0f, 0x11, 0x04, 0x82,  // movups XMMWORD PTR [edx+eax*4],xmm0
            0x83, 0xc0, 0x04,        // add eax,0x4
            0x39, 0xf8,              // cmp eax,edi
            0x75, 0xea,              // jne L1
            // L2: compute remSize = size % 4 (signed)
            0x89, 0xf7,              // mov edi,esi
            0xc1, 0xff, 0x1f,        // sar edi,0x1f
            0xc1, 0xef, 0x1e,        // shr edi,0x1e
            0x8d, 0x04, 0x3e,        // lea eax,[esi+edi*1]
            0x83, 0xe0, 0x03,        // and eax,0x3
            0x29, 0xf8,              // sub eax,edi
            0x74, 0x2a,              // je L4
            0x89, 0xf7,              // mov edi,esi
            0x29, 0xc7,              // sub edi,eax
            0x39, 0xfe,              // cmp esi,edi
            0x7e, 0x22,              // jle L4
            0xc1, 0xe7, 0x02,        // shl edi,0x2
            0x8d, 0x04, 0x3b,        // lea eax,[ebx+edi*1]
            0x8d, 0x1c, 0xb3,        // lea ebx,[ebx+esi*4]
            0x01, 0xf9,              // add ecx,edi
            0x01, 0xfa,              // add edx,edi
            // L3: scalar loop for the remaining elements (x87)
            0xd9, 0x00,              // fld DWORD PTR [eax]
            0x83, 0xc0, 0x04,        // add eax,0x4
            0x83, 0xc1, 0x04,        // add ecx,0x4
            0x83, 0xc2, 0x04,        // add edx,0x4
            0xd8, 0x49, 0xfc,        // fmul DWORD PTR [ecx-0x4]
            0xd9, 0x5a, 0xfc,        // fstp DWORD PTR [edx-0x4]
            0x39, 0xd8,              // cmp eax,ebx
            0x75, 0xeb,              // jne L3
            // L4:
            0x8d, 0x65, 0xf4,        // lea esp,[ebp-0xc]
            0x5b,                    // pop ebx
            0x5e,                    // pop esi
            0x5f,                    // pop edi
            0x5d,                    // pop ebp
            0xc2, 0x10, 0x00         // ret 0x10
        };
    }

    return NativeMethodHandle.Create<InnerProductDelegate>(machineCode);
}