/*
 * Copyright (c) 2020 Bitdefender
 * SPDX-License-Identifier: Apache-2.0
 */
//
// bdshemu.c
//

#include "../inc/bdshemu.h"
#include "../bddisasm/include/bddisasm_crt.h"
#include "include/bdshemu_common.h"

#if defined(ND_ARCH_X64) || defined(ND_ARCH_X86)
#ifdef __clang__
#include <wmmintrin.h>
#else
#include <immintrin.h>
#endif // __clang__
#endif // defined(ND_ARCH_X64) || defined(ND_ARCH_X86)

//
// A generic emulator value.
//
typedef struct _SHEMU_VALUE
{
    union
    {
        ND_UINT8        Bytes[ND_MAX_REGISTER_SIZE];
        ND_UINT16       Words[ND_MAX_REGISTER_SIZE / sizeof(ND_UINT16)];
        ND_UINT32       Dwords[ND_MAX_REGISTER_SIZE / sizeof(ND_UINT32)];
        ND_UINT64       Qwords[ND_MAX_REGISTER_SIZE / sizeof(ND_UINT64)];

        struct
        {
            ND_UINT16   FpuControlWord;
            ND_UINT16   FpuStatusWord;
            ND_UINT16   FpuTagWord;
            ND_UINT16   Rsvd;
            ND_UINT32   FpuDataPointer;
            ND_UINT32   FpuInstructionPointer;
            ND_UINT32   FpuLastInstructionOpcode;
        } FpuEnvironment;

        struct
        {
            ND_UINT16   FpuControlWord;
            ND_UINT16   FpuStatuwsWord;
            ND_UINT16   FpuTagWord;
            ND_UINT16   FpuOpcode;
            ND_UINT64   FpuRip;
            ND_UINT64   FpuDataPointer;
            ND_UINT32   Mxcsr;
            ND_UINT32   MxcsrMask;
        } XsaveArea;

        struct
        {
            ND_UINT16   Limit;
            ND_UINT64   Base;
        } Descriptor;

    } Value;

    ND_OPERAND_SIZE     Size;
} SHEMU_VALUE, *PSHEMU_VALUE;


enum
{
    FM_LOGIC,
    FM_SHL,
    FM_SHR,
    FM_SAR,
    FM_SUB,
    FM_ADD,
} FLAGS_MODE;


#define GET_OP(ctx, op, val) {                                                                                         \
    shstatus = ShemuX86GetOperandValue(ctx, op, val);                                                                  \
    if (SHEMU_SUCCESS != shstatus)                                                                                     \
    {                                                                                                                  \
        return shstatus;                                                                                               \
    }                                                                                                                  \
}

#define SET_OP(ctx, op, val) {                                                                                         \
    shstatus = ShemuX86SetOperandValue(ctx, op, val);                                                                  \
    if (SHEMU_SUCCESS != shstatus)                                                                                     \
    {                                                                                                                  \
        return shstatus;                                                                                               \
    }                                                                                                                  \
}

#define GET_FLAG(ctx, flg)                  (!!((ctx)->Arch.X86.Registers.RegFlags & (flg)))
#define SET_FLAG(ctx, flg, val)             ShemuX86SetFlag(ctx, flg, val)
#define SET_FLAGS(ctx, dst, src1, src2, fm) ShemuX86SetFlags(ctx, dst.Value.Qwords[0], src1.Value.Qwords[0],           \
                                                             src2.Value.Qwords[0], dst.Size, fm)


//
// ShemuX86SetFlag
//
static void
ShemuX86SetFlag(
    SHEMU_CONTEXT *Context,
    ND_UINT64 Flag,
    ND_UINT64 Value
    )
{
    // {NF} present for instruction, no flags will be modified.
    if (Context->Arch.X86.Instruction.HasNf)
    {
        return;
    }

    if (Value)
    {
        Context->Arch.X86.Registers.RegFlags |= Flag;
    }
    else
    {
        Context->Arch.X86.Registers.RegFlags &= ~Flag;
    }
}


//
// ShemuX86SetFlags
//
static void
ShemuX86SetFlags(
    SHEMU_CONTEXT *Context,
    ND_UINT64 Dst,
    ND_UINT64 Src1,
    ND_UINT64 Src2,
    ND_OPERAND_SIZE Size,
    ND_UINT8 FlagsMode
    )
{
    ND_UINT8 pfArr[16] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };

    // {NF} present for instruction, no flags will be modified.
    if (Context->Arch.X86.Instruction.HasNf)
    {
        return;
    }

    // Mask the operands with their respective size.
    Dst = ND_TRIM(Size, Dst);
    Src1 = ND_TRIM(Size, Src1);
    Src2 = ND_TRIM(Size, Src2);

    if (FlagsMode == FM_SHL || FlagsMode == FM_SHR || FlagsMode == FM_SAR)
    {
        // Shift with 0 count does not affect flags.
        if (Src2 == 0)
        {
            return;
        }
    }

    // PF set if the first bytes has an even number of 1 bits.
    if ((pfArr[Dst & 0xF] + pfArr[(Dst >> 4) & 0xF]) % 2 == 0)
    {
        Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_PF;
    }
    else
    {
        Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_PF;
    }

    // ZF set if the result is zero.
    if (Dst == 0)
    {
        Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_ZF;
    }
    else
    {
        Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_ZF;
    }

    // SF is set if the sign flag is set.
    if (ND_GET_SIGN(Size, Dst) != 0)
    {
        Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_SF;
    }
    else
    {
        Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_SF;
    }

    // OF and CF are handled differently for some instructions.
    if (FM_LOGIC == FlagsMode)
    {
        // OF and CF are cleared on logic instructions.
        Context->Arch.X86.Registers.RegFlags &= ~(NDR_RFLAG_OF | NDR_RFLAG_CF);
    }
    else if (FM_SHL == FlagsMode)
    {
        // CF is the last bit shifted out of the destination.
        if ((Src2 <= Size * 8ULL) && ND_GET_BIT((Size * 8ULL) - Src2, Src1))
        {
            Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_CF;
        }
        else
        {
            Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_CF;
        }

        if (Src2 == 1)
        {
            if (ND_GET_BIT(Size * 8ULL - 1, Src1) ^ ND_GET_BIT(Size * 8ULL - 2, Src1))
            {
                Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_OF;
            }
            else
            {
                Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_OF;
            }
        }
    }
    else if (FM_SHR == FlagsMode)
    {
        // CF is the last bit shifted out of the destination.
        // Src2 - 1 is ok - this function does not get called if Src2 is 0.
        if (ND_GET_BIT(Src2 - 1, Src1))
        {
            Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_CF;
        }
        else
        {
            Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_CF;
        }

        if (Src2 == 1)
        {
            if (ND_GET_BIT(Size * 8ULL - 1, Src1))
            {
                Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_OF;
            }
            else
            {
                Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_OF;
            }
        }
    }
    else if (FM_SAR == FlagsMode)
    {
        // CF is the last bit shifted out of the destination. In case of SAR, if the shift ammount exceeds the operand
        // size, CF will be 1 if the result is -1, or 0 if the result is 0.
        // Src2 - 1 is ok - this function does not get called if Src2 is 0.
        if (ND_GET_BIT(Src2 - 1, Src1) || ((Src2 >= (ND_UINT64)Size * 8) && Dst != 0))
        {
            Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_CF;
        }
        else
        {
            Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_CF;
        }

        Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_OF;
    }
    else
    {
        // Set CF.
        if ((FM_SUB == FlagsMode) && ((Src1 < Src2) || (Src1 == Src2 && Dst != 0)))
        {
            Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_CF;
        }
        else if ((FM_ADD == FlagsMode) && ((Dst < Src1) || (Dst == Src1 && Src2 != 0)))
        {
            Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_CF;
        }
        else
        {
            Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_CF;
        }

        // Set OF.
        if (FM_SUB == FlagsMode)
        {
            if ((ND_GET_SIGN(Size, Src1) && !ND_GET_SIGN(Size, Src2) && !ND_GET_SIGN(Size, Dst)) ||
                (!ND_GET_SIGN(Size, Src1) && ND_GET_SIGN(Size, Src2) && ND_GET_SIGN(Size, Dst)))
            {
                Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_OF;
            }
            else
            {
                Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_OF;
            }
        }
        else if (FM_ADD == FlagsMode)
        {
            if (ND_GET_SIGN(Size, Src1) == ND_GET_SIGN(Size, Src2) && 
                ND_GET_SIGN(Size, Src1) != ND_GET_SIGN(Size, Dst))
            {
                Context->Arch.X86.Registers.RegFlags |= NDR_RFLAG_OF;
            }
            else
            {
                Context->Arch.X86.Registers.RegFlags &= ~NDR_RFLAG_OF;
            }
        }
    }
}


//
// ShemuX86EvalCondition
//
static ND_BOOL
ShemuX86EvalCondition(
    SHEMU_CONTEXT *Context,
    ND_UINT8 ConditionCode
    )
{
    switch (ConditionCode)
    {
    case ND_COND_OVERFLOW: // O
        if (GET_FLAG(Context, NDR_RFLAG_OF) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_OVERFLOW): // NO
        if (GET_FLAG(Context, NDR_RFLAG_OF) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_CARRY: // C/B/NAE
        if (GET_FLAG(Context, NDR_RFLAG_CF) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_CARRY): // NC/NB/AE
        if (GET_FLAG(Context, NDR_RFLAG_CF) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_ZERO: // E/Z
        if (GET_FLAG(Context, NDR_RFLAG_ZF) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_ZERO): // NE/NZ
        if (GET_FLAG(Context, NDR_RFLAG_ZF) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_BELOW_OR_EQUAL: // BE/NA
        if ((GET_FLAG(Context, NDR_RFLAG_CF) | (GET_FLAG(Context, NDR_RFLAG_ZF))) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_BELOW_OR_EQUAL): // A/NBE
        if ((GET_FLAG(Context, NDR_RFLAG_CF) | (GET_FLAG(Context, NDR_RFLAG_ZF))) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_SIGN: // S
        if (GET_FLAG(Context, NDR_RFLAG_SF) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_SIGN): // NS
        if (GET_FLAG(Context, NDR_RFLAG_SF) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_PARITY: // P
        if (GET_FLAG(Context, NDR_RFLAG_PF) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_PARITY): // NP
        if (GET_FLAG(Context, NDR_RFLAG_PF) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_LESS: // L/NGE
        if ((GET_FLAG(Context, NDR_RFLAG_SF) ^ GET_FLAG(Context, NDR_RFLAG_OF)) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_LESS): // NL/GE
        if ((GET_FLAG(Context, NDR_RFLAG_SF) ^ GET_FLAG(Context, NDR_RFLAG_OF)) == 0)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_LESS_OR_EQUAL: // LE/NG
        if (((GET_FLAG(Context, NDR_RFLAG_SF) ^ GET_FLAG(Context, NDR_RFLAG_OF)) |
            (GET_FLAG(Context, NDR_RFLAG_ZF))) == 1)
        {
            return ND_TRUE;
        }
        break;
    case ND_COND_NOT(ND_COND_LESS_OR_EQUAL): // NLE/G
        if (((GET_FLAG(Context, NDR_RFLAG_SF) ^ GET_FLAG(Context, NDR_RFLAG_OF)) |
            (GET_FLAG(Context, NDR_RFLAG_ZF))) == 0)
        {
            return ND_TRUE;
        }
        break;
    }

    return ND_FALSE;
}


//
// ShemuX86GetGprValue
//
static ND_UINT64
ShemuX86GetGprValue(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Reg,
    ND_UINT32 Size,
    ND_BOOL High8
    )
{
    if (High8 && Reg >= 4)
    {
        Reg = Reg - 4;
    }

    // Any read of the GPR, before being modified, counts as being "saved". The reason here is that we cannot
    // easily track a proper save in memory, as the register may be copied in another register, and only then
    // saved.
    if (!(Context->Arch.X86.GprTracker[Reg] & GPR_TRACK_DIRTY))
    {
        Context->Arch.X86.GprTracker[Reg] |= GPR_TRACK_READ;
    }

    switch (Size)
    {
    case 1:
        if (High8)
        {
            // AH, CH, DH or BH accessed.
            return (*(&Context->Arch.X86.Registers.RegRax + Reg) >> 8) & 0xff;
        }
        else
        {
            return *(&Context->Arch.X86.Registers.RegRax + Reg) & 0xff;
        }

    case 2:
        return *(&Context->Arch.X86.Registers.RegRax + Reg) & 0xffff;

    case 4:
        return *(&Context->Arch.X86.Registers.RegRax + Reg) & 0xffffffff;

    default:
        return *(&Context->Arch.X86.Registers.RegRax + Reg);
    }
}


//
// ShemuX86GetGprValue
//
static void
ShemuX86SetGprValue(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Reg,
    ND_UINT32 Size,
    ND_UINT64 Value,
    ND_BOOL High8
    )
{
    if (High8 && Reg >= 4)
    {
        Reg = Reg - 4;
    }

    // If ZeroUpper semantic is enabled for APX instruction, make sure to zero out the destination.
    // {ZU} can be either explicit (example: IMUL), or implicit (example: any {ND} instruction).
    if (Context->Arch.X86.Instruction.HasZu || Context->Arch.X86.Instruction.HasNd)
    {
        *(&Context->Arch.X86.Registers.RegRax + Reg) = 0;
    }

    switch (Size)
    {
    case 1:
        if (High8)
        {
            // AH, CH, DH or BH accessed.
            *((ND_UINT8 *)(&Context->Arch.X86.Registers.RegRax + Reg) + 1) = Value & 0xFF;
        }
        else
        {
            *((ND_UINT8 *)(&Context->Arch.X86.Registers.RegRax + Reg)) = Value & 0xff;
        }
        break;

    case 2:
        *((ND_UINT16 *)(&Context->Arch.X86.Registers.RegRax + Reg)) = Value & 0xffff;
        break;

    case 4:
        // Higher ND_UINT32 is always set to zero.
        *(&Context->Arch.X86.Registers.RegRax + Reg) = Value & 0xffffffff;
        break;

    default:
        *(&Context->Arch.X86.Registers.RegRax + Reg) = Value;
        break;
    }

    // Mark the GPR as being dirty/written.
    if (!(Context->Arch.X86.GprTracker[Reg] & GPR_TRACK_READ))
    {
        Context->Arch.X86.GprTracker[Reg] |= GPR_TRACK_DIRTY;
    }
}


//
// ShemuX86GetSegValue
//
static ND_UINT64
ShemuX86GetSegValue(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Reg
    )
{
    switch (Reg)
    {
    case NDR_ES:
        return Context->Arch.X86.Segments.Es.Selector;
    case NDR_CS:
        return Context->Arch.X86.Segments.Cs.Selector;
    case NDR_SS:
        return Context->Arch.X86.Segments.Ss.Selector;
    case NDR_DS:
        return Context->Arch.X86.Segments.Ds.Selector;
    case NDR_FS:
        return Context->Arch.X86.Segments.Fs.Selector;
    case NDR_GS:
        return Context->Arch.X86.Segments.Gs.Selector;
    }

    return 0;
}


//
// ShemuX86SetSegValue
//
static void
ShemuX86SetSegValue(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Reg,
    ND_UINT16 Value
    )
{
    switch (Reg)
    {
    case NDR_ES:
        Context->Arch.X86.Segments.Es.Selector = Value;
        break;
    case NDR_CS:
        Context->Arch.X86.Segments.Cs.Selector = Value;
        break;
    case NDR_SS:
        Context->Arch.X86.Segments.Ss.Selector = Value;
        break;
    case NDR_DS:
        Context->Arch.X86.Segments.Ds.Selector = Value;
        break;
    case NDR_FS:
        Context->Arch.X86.Segments.Fs.Selector = Value;
        break;
    case NDR_GS:
        Context->Arch.X86.Segments.Gs.Selector = Value;
        break;
    }
}


//
// ShemuX86GetSegBase
//
static ND_UINT64
ShemuX86GetSegBase(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Reg
    )
{
    switch (Reg)
    {
    case NDR_ES:
        return Context->Arch.X86.Segments.Es.Base;
    case NDR_CS:
        return Context->Arch.X86.Segments.Cs.Base;
    case NDR_SS:
        return Context->Arch.X86.Segments.Ss.Base;
    case NDR_DS:
        return Context->Arch.X86.Segments.Ds.Base;
    case NDR_FS:
        return Context->Arch.X86.Segments.Fs.Base;
    case NDR_GS:
        return Context->Arch.X86.Segments.Gs.Base;
    }

    return 0;
}


//
// ShemuX86IsSelectorValid
//
static ND_BOOL
ShemuX86IsSelectorValid(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Reg,
    ND_UINT16 Value
    )
{
    // Allow NULL selectors in 64 bit mode inside ES, DS, FS, GS.
    if ((Value == 0) && (Context->Arch.X86.Mode == ND_CODE_64) && (Reg != NDR_SS))
    {
        return ND_TRUE;
    }

    if ((Value & 0xFFF8) >= 0x80)
    {
        // Too large value.
        return ND_FALSE;
    }

    if ((Value & 4) != 0)
    {
        // LDT selector.
        return ND_FALSE;
    }

    if ((Context->Arch.X86.Ring == 0) && ((Value & 3) != 0))
    {
        // Ring 0 selector, with RPL != 0.
        return ND_FALSE;
    }

    if ((Context->Arch.X86.Ring == 3) && ((Value & 3) != 3))
    {
        // Ring 3 selector, with RPL != 3.
        return ND_FALSE;
    }

    return ND_TRUE;
}


//
// ShemuX86IsAddressAligned
//
static ND_BOOL
ShemuX86IsAddressAligned(
    SHEMU_CONTEXT *Context,
    ND_UINT64 Address
    )
{
    if (Context->Arch.X86.Mode == ND_CODE_64)
    {
        return Address % 8 == 0;
    }
    else
    {
        return Address % 4 == 0;
    }
}


//
// ShemuX86TrackLoopStart
//
static void
ShemuX86TrackLoopStart(
    SHEMU_CONTEXT *Context,
    ND_UINT64 Rip,
    ND_UINT64 Target
    )
{
    if (Context->LoopTrack.Active)
    {
        return;
    }

    Context->LoopTrack.Active = ND_TRUE;
    Context->LoopTrack.Iteration = 1;
    Context->LoopTrack.Address = Rip;
    Context->LoopTrack.Target = Target;

    if (!!(Context->Options & SHEMU_OPT_TRACE_LOOPS))
    {
        shemu_printf(Context, "        Loop BEGIN from RIP 0x%016llx, TARGET 0x%016llx, ITER %llu\n",
                     Context->LoopTrack.Address, Context->LoopTrack.Target, Context->LoopTrack.Iteration);
    }
}


//
// ShemuX86TrackLoopStop
//
static void
ShemuX86TrackLoopStop(
    SHEMU_CONTEXT *Context
    )
{
    if (!Context->LoopTrack.Active)
    {
        return;
    }

    if (!!(Context->Options & SHEMU_OPT_TRACE_LOOPS))
    {
        shemu_printf(Context, "        Loop BREAK from RIP 0x%016llx, TARGET 0x%016llx, ITER %llu\n",
                     Context->LoopTrack.Address, Context->LoopTrack.Target, Context->LoopTrack.Iteration);
    }

    Context->LoopTrack.Active = ND_FALSE;
    Context->LoopTrack.Address = 0;
    Context->LoopTrack.Target = 0;
    Context->LoopTrack.Iteration = 0;
}


//
// ShemuX86TrackLoop
//
static ND_BOOL
ShemuX86TrackLoop(
    SHEMU_CONTEXT *Context,
    ND_BOOL Taken
    )
{
    ND_UINT64 rip, target;

    // The address of the loop instruction itself, not the next instruction.
    rip = Context->Arch.X86.Registers.RegRip - Context->Arch.X86.Instruction.Length;

    // The branches we monitor always have a relative offset, which is the first operand.
    target = Context->Arch.X86.Registers.RegRip + Context->Arch.X86.Instruction.Operands[0].Info.RelativeOffset.Rel;

    if (target < rip)
    {
        // Case 1: Backward branch.
        if (Taken)
        {
            // Taken backward branch instruction.
            if (rip == Context->LoopTrack.Address)
            {
                // Current active loop, this is a new iteration. Example:
                // label1:                  <--+
                //      ...                    |
                //      loop    label1      ---+
                //      ...
                // Update loop information.
                Context->LoopTrack.Iteration++;
            }
            else
            {
                // New loop, or loop within our current loop. Always track the inner most loop. Example:
                // label1:                  <--+
                //      ...                    |
                // label2:              <--+   |
                //      ...                |   |
                //      loop    label2  ---+   |
                //      ...                    |
                //      loop    label1      ---+
                //      ...
                // Start tracking the (potentially new) loop.
                ShemuX86TrackLoopStop(Context);

                ShemuX86TrackLoopStart(Context, rip, target);
            }
        }
        else
        {
            // Not taken backward branch instruction.
            if (rip == Context->LoopTrack.Address)
            {
                // Current loop ends, the loop instruction is no longer taken. Example:
                // label1:
                //      ...
                //      loop    label1      ---+
                //      ...                 <--+
                // Stop tracking.
                ShemuX86TrackLoopStop(Context);
            }
            else
            {
                // Not taken backward branch, but not our current loop.
                // Nothing to do.
            }
        }
    }
    else
    {
        // Case 2: Forward branch.
        if (Taken)
        {
            // Taken forward branch instruction.
            if (target < Context->LoopTrack.Target || target > Context->LoopTrack.Address)
            {
                // Break instruction. Example:
                // label1:
                //      ...
                //      jnz     label2      ---+
                //      ...                    |
                //      loop    label1         |
                //      ...                    |
                // label2:                  <--+
                //      ...
                // Stop tracking.
                ShemuX86TrackLoopStop(Context);
            }
            else
            {
                // If instruction. Example:
                // label1:
                //      ...
                //      jnz     label2      ---+
                //      ...                    |
                // label2:                  <--+
                //      ...
                //      loop    label1
                //      ...
                // Nothing to do.
            }
        }
        else
        {
            // Not taken forward branch instruction.
            // Nothing to do.
        }
    }

    // In theory, we can override whether the branch is taken or not, but for now, this is not used.
    return Taken;
}


//
// ShemuX86ComputeLinearAddress
//
static ND_UINT64
ShemuX86ComputeLinearAddress(
    SHEMU_CONTEXT *Context,
    PND_OPERAND Operand
    )
{
    ND_UINT64 gla = 0;

    if (Operand->Info.Memory.HasBase)
    {
        gla += ShemuX86GetGprValue(Context, Operand->Info.Memory.Base, Operand->Info.Memory.BaseSize, ND_FALSE);
    }

    if (Operand->Info.Memory.HasIndex)
    {
        gla += ShemuX86GetGprValue(Context, Operand->Info.Memory.Index, Operand->Info.Memory.IndexSize, ND_FALSE) *
                                Operand->Info.Memory.Scale;
    }

    // Note that this also handles the case where moffset (absolute addressing) is used, as HasDisp will be set when
    // IsDirect is also set.
    if (Operand->Info.Memory.HasDisp)
    {
        gla += Operand->Info.Memory.Disp;
    }

    if (Operand->Info.Memory.IsRipRel)
    {
        gla += Context->Arch.X86.Registers.RegRip;
    }

    // Special handling for BT, BTR, BTS, BTC instructions with bitbase addressing.
    if (Operand->Info.Memory.IsBitbase)
    {
        ND_UINT64 bitbase, op1size, op2size, reg;

        op1size = Context->Arch.X86.Instruction.Operands[0].Size;
        op2size = Context->Arch.X86.Instruction.Operands[1].Size;

        reg = ((ND_UINT64*)&Context->Arch.X86.Registers.RegRax)[Context->Arch.X86.Instruction.Operands[1].Info.Register.Reg];

        // Note: only BT* with register source (NOT immediate) support bitbase addressing.
        bitbase = ND_SIGN_EX(op2size, reg);

        if (bitbase & (1ULL << 63))
        {
            gla -= ((~bitbase >> 3) & ~(op1size - 1)) + op1size;
        }
        else
        {
            gla += (bitbase >> 3) & ~(op1size - 1);
        }
    }

    // Special handling for stack operations: if we have a PUSH, we have to subtract the accessed size, as, in fact,
    // [RSP - size] is accessed, not [RSP].
    if (Operand->Info.Memory.IsStack)
    {
        if (Operand->Access.Write || Operand->Access.CondWrite)
        {
            gla -= Operand->Size;
        }
    }

    // Make sure we truncate the linear address to the address size.
    switch (Context->Arch.X86.Instruction.AddrMode)
    {
    case ND_ADDR_32:
        gla &= 0xFFFFFFFF;
        break;
    case ND_ADDR_16:
        gla &= 0xFFFF;
    default:
        break;
    }

    // Memory operands usually have a segment. Note that we don't care about any segment checks, since we're most
    // likely be provided with flat segments. If checks should be needed, dedicated callbacks should be added.
    if (Operand->Info.Memory.HasSeg)
    {
        gla += ShemuX86GetSegBase(Context, Operand->Info.Memory.Seg);

        if (Context->Arch.X86.Mode != ND_CODE_64)
        {
            // Truncate to 32 bit outside 64 bit.
            gla &= 0xFFFFFFFF;
        }
    }

    return gla;
}



//
// ShemuX86GetOperandValue
//
static SHEMU_STATUS
ShemuX86GetOperandValue(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Operand,
    SHEMU_VALUE *Value
    )
{
    SHEMU_STATUS status;
    PND_OPERAND op = &Context->Arch.X86.Instruction.Operands[Operand];

    Value->Size = op->Size;
    if (Value->Size > sizeof(Value->Value))
    {
        return SHEMU_ABORT_OPERAND_NOT_SUPPORTED;
    }

    if (op->Type == ND_OP_REG)
    {
        switch (op->Info.Register.Type)
        {
        case ND_REG_GPR:
            Value->Value.Qwords[0] = ShemuX86GetGprValue(Context, op->Info.Register.Reg, op->Size, 
                                                         op->Info.Register.IsHigh8);
            break;

        case ND_REG_SEG:
            Value->Value.Qwords[0] = ShemuX86GetSegValue(Context, op->Info.Register.Reg);
            break;

        case ND_REG_MMX:
            Value->Value.Qwords[0] = Context->Arch.X86.MmxRegisters[op->Info.Register.Reg];
            break;

        case ND_REG_SSE:
            shemu_memcpy(Value->Value.Bytes,
                         &Context->Arch.X86.SseRegisters[op->Info.Register.Reg], 
                         op->Size);
            break;

        case ND_REG_RIP:
            Value->Value.Qwords[0] = ND_TRIM(Value->Size, Context->Arch.X86.Registers.RegRip);
            break;

        case ND_REG_FLG:
            Value->Value.Qwords[0] = Context->Arch.X86.Registers.RegFlags;
            break;

        case ND_REG_CR:
            switch (op->Info.Register.Reg)
            {
            case NDR_CR0:
                Value->Value.Qwords[0] = Context->Arch.X86.Registers.RegCr0;
                break;
            case NDR_CR2:
                Value->Value.Qwords[0] = Context->Arch.X86.Registers.RegCr2;
                break;
            case NDR_CR3:
                Value->Value.Qwords[0] = Context->Arch.X86.Registers.RegCr3;
                break;
            case NDR_CR4:
                Value->Value.Qwords[0] = Context->Arch.X86.Registers.RegCr4;
                break;
            case NDR_CR8:
                Value->Value.Qwords[0] = Context->Arch.X86.Registers.RegCr8;
                break;
            default:
                return SHEMU_ABORT_REGISTER_NOT_SUPPORTED;
            }
            break;
            

        default:
            return SHEMU_ABORT_REGISTER_NOT_SUPPORTED;
        }
    }
    else if (op->Type == ND_OP_MEM)
    {
        ND_UINT64 gla = ShemuX86ComputeLinearAddress(Context, op);
        ND_UINT32 offsetPeb;
        ND_UINT8 seg;

        if (op->Info.Memory.IsAG)
        {
            // Address generation instruction, the result is the linear address itself.
            Value->Value.Qwords[0] = gla;
            goto done_gla;
        }

        if (Context->Arch.X86.Ring == 3)
        {
            // User-mode TIB offset that contains the PEB address.
            offsetPeb = Context->Arch.X86.Mode == ND_CODE_32 ? 0x30 : 0x60;
            seg = Context->Arch.X86.Mode == ND_CODE_32 ? ND_PREFIX_G2_SEG_FS : ND_PREFIX_G2_SEG_GS;
        }
        else
        {
            // Kernel-mode KPCR offset that contains the current KTHREAD address.
            offsetPeb = Context->Arch.X86.Mode == ND_CODE_32 ? 0x124 : 0x188;
            seg = Context->Arch.X86.Mode == ND_CODE_32 ? ND_PREFIX_G2_SEG_FS : ND_PREFIX_G2_SEG_GS;
        }

        // Check if this is a TIB/PCR access. Make sure the FS/GS register is used for the access, in order to avoid
        // FALSE positives where legitimate code accesses a linear TIB directly.
        // Note that this covers accesses to the PEB field inside the TIB.
        if (Context->Arch.X86.Ring == 3)
        {
            if (Context->Arch.X86.Instruction.Seg == seg &&
                gla == Context->TibBase + offsetPeb)
            {
                Context->Flags |= SHEMU_FLAG_TIB_ACCESS_PEB;
            }
        }

        // Note that this covers accesses to the Wow32Reserved in Wow64 mode. That field can be used to issue
        // syscalls.
        if (gla == Context->TibBase + 0xC0 && Context->Arch.X86.Instruction.Seg == seg && 
            Context->Arch.X86.Mode == ND_CODE_32)
        {
            Context->Flags |= SHEMU_FLAG_TIB_ACCESS_WOW32;
        }

        // Check for accesses inside the KUSER_SHARED_DATA (SharedUserData). This page contains some
        // global system information, it may host shellcodes, and is hard-coded at this address. We
        // only consider accesses to some SUD fields, in order to avoid false-positives.
        if ((gla >= 0x7FFE02D4 && gla < 0x7FFE02D5) || // KdDebuggerEnabled 
            (gla >= 0x7FFE0308 && gla < 0x7FFE1310) || // SystemCall
            (gla >= 0x7FFE0330 && gla < 0x7FFE1338))   // Cookie
        {
            Context->Flags |= SHEMU_FLAG_SUD_ACCESS;
        }

        // Check if we are reading a previously saved RIP. Ignore RET instructions, which naturally access the 
        // saved RIP.
        if (Context->Arch.X86.Instruction.Category != ND_CAT_RET &&
            ShemuIsStackPtr(Context, gla, op->Size) &&
            ShemuStackBmpStateCheck(Context, gla - Context->StackBase, op->Size, STACK_BYTE_RIP))
        {
            PND_OPERAND dst = ND_NULL;

            for (ND_UINT32 i = 0; i < Context->Arch.X86.Instruction.OperandsCount; i++)
            {
                // We fetch the first destination, if any. We ignore the current operand.
                if ((i != Operand) && (Context->Arch.X86.Instruction.Operands[i].Access.Access & ND_ACCESS_ANY_WRITE))
                {
                    dst = &Context->Arch.X86.Instruction.Operands[i];
                    break;
                }
            }

            // We consider a LOAD_RIP detection only if the RIP value is loaded in:
            // - Another memory location;
            // - A register, which is not (excluding bank register access, such as PPOPA):
            //      - A segment register;
            //      - The RIP register;
            //      - The Flags register;
            // Loads which are propagated to other operand types (RIP, segment register, etc.) or which are not 
            // propagated at all (RMW instructions) are not considered, since they don't store the RIP anywehere.
            if ((dst != ND_NULL) &&
                ((dst->Type == ND_OP_BANK) ||
                 (dst->Type == ND_OP_MEM) ||
                 (dst->Type == ND_OP_REG && 
                     dst->Info.Register.Type != ND_REG_SEG && 
                     dst->Info.Register.Type != ND_REG_RIP &&
                     dst->Info.Register.Type != ND_REG_FLG)))
            {
                Context->Flags |= SHEMU_FLAG_LOAD_RIP;
            }
        }

        // Get the memory value.
        status = ShemuMemLoad(Context, gla, Value->Size, Value->Value.Bytes);
        if (SHEMU_SUCCESS != status)
        {
            return status;
        }

        if (Context->Options & SHEMU_OPT_TRACE_MEMORY)
        {
            ShemuDisplayMemValue(Context, gla, Value->Size, Value->Value.Bytes, ND_TRUE);
        }

        // If this is a stack access, we need to update the stack pointer.
        if (op->Info.Memory.IsStack)
        {
            ND_UINT64 regval = ShemuX86GetGprValue(Context, NDR_RSP, (2 << Context->Arch.X86.Instruction.DefStack), ND_FALSE);

            regval += op->Size;

            ShemuX86SetGprValue(Context, NDR_RSP, (2 << Context->Arch.X86.Instruction.DefStack), regval, ND_FALSE);
        }

        // If this is a string operation, make sure we update RSI/RDI.
        if (op->Info.Memory.IsString)
        {
            ND_UINT64 regval = ShemuX86GetGprValue(Context, op->Info.Memory.Base, op->Info.Memory.BaseSize, ND_FALSE);

            regval = GET_FLAG(Context, NDR_RFLAG_DF) ? regval - op->Size : regval + op->Size;

            ShemuX86SetGprValue(Context, op->Info.Memory.Base, op->Info.Memory.BaseSize, regval, ND_FALSE);
        }

done_gla:;
    }
    else if (op->Type == ND_OP_IMM)
    {
        Value->Value.Qwords[0] = op->Info.Immediate.Imm;
    }
    else if (op->Type == ND_OP_CONST)
    {
        Value->Value.Qwords[0] = op->Info.Constant.Const;
    }
    else if (op->Type == ND_OP_OFFS)
    {
        Value->Value.Qwords[0] = op->Info.RelativeOffset.Rel;
    }
    else
    {
        return SHEMU_ABORT_OPERAND_NOT_SUPPORTED;
    }

    return SHEMU_SUCCESS;
}


//
// ShemuX86SetOperandValue
//
static SHEMU_STATUS
ShemuX86SetOperandValue(
    SHEMU_CONTEXT *Context,
    ND_UINT32 Operand,
    SHEMU_VALUE *Value
    )
{
    SHEMU_STATUS status;
    PND_OPERAND op = &Context->Arch.X86.Instruction.Operands[Operand];

    // If a stack address is being loaded, check if it points to a string built on the stack.
    if (ShemuIsStackPtr(Context, Value->Value.Qwords[0], Context->StrThreshold))
    {
        ND_UINT64 sptr = Value->Value.Qwords[0];
        ND_UINT32 i, stckstrlen = 0;

        // Check if a string was saved on the stack. Typically used by shellcodes like this:
        // PUSH str0
        // PUSH str1
        // ...
        // PUSH strn
        // Other variants may exist, but all we care about are stores on the stack, and all are checked.

        for (i = 0; i < Context->StrThreshold; i++)
        {
            unsigned char c;

            status = ShemuMemLoad(Context, sptr + i, 1, &c);
            if (SHEMU_SUCCESS != status)
            {
                break;
            }

            if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') ||
                c == '\\' || c == '/' || c == ':' || c == ' ')
            {
                stckstrlen++;
            }
            else
            {
                break;
            }
        }

        if (stckstrlen >= Context->StrThreshold)
        {
            Context->Flags |= SHEMU_FLAG_STACK_STR;
        }
    }


    if (op->Type == ND_OP_REG)
    {
        switch (op->Info.Register.Type)
        {
        case ND_REG_GPR:
            if (Context->Arch.X86.Instruction.Instruction == ND_INS_XCHG &&
                op->Info.Register.Reg == NDR_RSP &&
                ShemuX86IsAddressAligned(Context, Value->Value.Qwords[0]))
            {
                // Conditions for a STACK_PIVOT detection:
                // 1. The instruction is XCHG
                // 2. The value loaded in the RSP register is naturally aligned
                // 3. The value points either inside the shellcode or the stack area, and at least 64 bytes are valid
                if (ShemuIsShellcodePtr(Context, Value->Value.Qwords[0], 64) ||
                    ShemuIsStackPtr(Context, Value->Value.Qwords[0], 64))
                {
                    Context->Flags |= SHEMU_FLAG_STACK_PIVOT;
                }
            }

            ShemuX86SetGprValue(Context, op->Info.Register.Reg, op->Size, Value->Value.Qwords[0], 
                             op->Info.Register.IsHigh8);
            break;

        case ND_REG_SEG:
            if (!ShemuX86IsSelectorValid(Context, op->Info.Register.Reg, Value->Value.Words[0]))
            {
                return SHEMU_ABORT_INVALID_SELECTOR;
            }
            ShemuX86SetSegValue(Context, op->Info.Register.Reg, Value->Value.Words[0]);
            break;

        case ND_REG_MMX:
            Context->Arch.X86.MmxRegisters[op->Info.Register.Reg] = Value->Value.Qwords[0];
            // Only log these when they're written.
            if (Context->Options & SHEMU_OPT_TRACE_EMULATION)
            {
                shemu_printf(Context, "        MM%d = 0x%016llx\n", op->Info.Register.Reg, Value->Value.Qwords[0]);
            }
            break;

        case ND_REG_SSE:
            if (Context->Arch.X86.Instruction.EncMode != ND_ENCM_LEGACY)
            {
                // Zero the entire register first, if we have a VEX/EVEX encoded instruction.
                nd_memzero(&Context->Arch.X86.SseRegisters[op->Info.Register.Reg], ND_MAX_REGISTER_SIZE);
            }
            else
            {
                // Zero upper bits in the 128 bits register, if operand size is less than 16 bytes.
                // Upper bits in the YMM/ZMM register are preserved.
                nd_memzero(&Context->Arch.X86.SseRegisters[op->Info.Register.Reg], 16);
            }
            // Copy the value.
            shemu_memcpy(&Context->Arch.X86.SseRegisters[op->Info.Register.Reg],
                         Value->Value.Bytes, 
                         op->Size);
            // Only log these when they're written.
            if (Context->Options & SHEMU_OPT_TRACE_EMULATION)
            {
                shemu_printf(Context,
                             "        %cMM%d (HI_32) = 0x%016llx%016llx%016llx%016llx\n",
                             op->Size == 16 ? 'X' : op->Size == 32 ? 'Y' : 'Z', op->Info.Register.Reg,
                             Value->Value.Qwords[7], Value->Value.Qwords[6], 
                             Value->Value.Qwords[5], Value->Value.Qwords[4]);
                shemu_printf(Context,
                             "        %cMM%d (LO_32) = 0x%016llx%016llx%016llx%016llx\n",
                             op->Size == 16 ? 'X' : op->Size == 32 ? 'Y' : 'Z', op->Info.Register.Reg,
                             Value->Value.Qwords[3], Value->Value.Qwords[2], 
                             Value->Value.Qwords[1], Value->Value.Qwords[0]);
            }
            break;

        case ND_REG_RIP:
            Context->Arch.X86.Registers.RegRip = ND_TRIM(Value->Size, Value->Value.Qwords[0]);
            break;

        case ND_REG_FLG:
            if (op->Size == 2)
            {
                *((ND_UINT16*)&Context->Arch.X86.Registers.RegFlags) = Value->Value.Words[0];
            }
            else
            {
                Context->Arch.X86.Registers.RegFlags = Value->Value.Qwords[0];
            }
            // Handle reserved bits.
            Context->Arch.X86.Registers.RegFlags |= (1ULL << 1);
            Context->Arch.X86.Registers.RegFlags &= ~((1ULL << 3) | (1ULL << 5) | (1ULL << 15));
            Context->Arch.X86.Registers.RegFlags &= 0x3FFFFF;
            break;

        case ND_REG_CR:
            switch (op->Info.Register.Reg)
            {
            case NDR_CR0:
                Context->Arch.X86.Registers.RegCr0 = Value->Value.Qwords[0];
                break;
            case NDR_CR2:
                Context->Arch.X86.Registers.RegCr2 = Value->Value.Qwords[0];
                break;
            case NDR_CR3:
                Context->Arch.X86.Registers.RegCr3 = Value->Value.Qwords[0];
                break;
            case NDR_CR4:
                Context->Arch.X86.Registers.RegCr4 = Value->Value.Qwords[0];
                break;
            case NDR_CR8:
                Context->Arch.X86.Registers.RegCr8 = Value->Value.Qwords[0];
                break;
            default:
                return SHEMU_ABORT_REGISTER_NOT_SUPPORTED;
            }
            break;

        default:
            return SHEMU_ABORT_REGISTER_NOT_SUPPORTED;
        }
    }
    else if (op->Type == ND_OP_MEM)
    {
        // Compute the GLA.
        ND_UINT64 gla = ShemuX86ComputeLinearAddress(Context, op);

        // Handle self-write. We store a 1 for each written byte inside the shellcode space. Once the modified bytes
        // are executed, we can trigger the self-write detection.
        if (ShemuIsShellcodePtr(Context, gla, op->Size))
        {
            ShemuShellBmpStateSet(Context, gla - Context->ShellcodeBase, op->Size, SHELL_BYTE_DIRTY);
        }

        // Handle RIP save on the stack.
        if (ShemuIsStackPtr(Context, gla, MAX(op->Size, Context->Arch.X86.Instruction.WordLength)))
        {
            // Note: only Context->Arch.X86.Instruction.WordLength bits are flagged as RIP, as that is the RIP size.
            if (Context->Arch.X86.Instruction.Instruction == ND_INS_CALLNR ||
                Context->Arch.X86.Instruction.Instruction == ND_INS_CALLNI)
            {
                ShemuStackBmpStateSet(Context, gla - Context->StackBase, 
                                      Context->Arch.X86.Instruction.WordLength, STACK_BYTE_RIP);
            }
            else if (Context->Arch.X86.Instruction.Instruction == ND_INS_FNSTENV)
            {
                // OK: op->Size will be the FPU state size (28 bytes); we only emulate 32 & 64 bit forms, the RIP is 
                // always 4 bytes.
                ShemuStackBmpStateSet(Context, (gla + 0xC) - Context->StackBase, 
                                      MIN(Context->Arch.X86.Instruction.WordLength, 4), STACK_BYTE_RIP);
            }
            else if (Context->Arch.X86.Instruction.Instruction == ND_INS_FXSAVE || 
                Context->Arch.X86.Instruction.Instruction == ND_INS_FXSAVE64)
            {
                // OK: op->Size will be the FXSAVE size (512 bytes).
                ShemuStackBmpStateSet(Context, (gla + 0x8) - Context->StackBase, 
                                      Context->Arch.X86.Instruction.WordLength, STACK_BYTE_RIP);
            }
            else
            {
                // Something is written on a previously saved RIP; reset it.
                ShemuStackBmpStateClear(Context, gla - Context->StackBase, op->Size, STACK_BYTE_RIP);
            }
        }

        // Set the value.
        status = ShemuMemStore(Context, gla, MIN(op->Size, Value->Size), Value->Value.Bytes);
        if (SHEMU_SUCCESS != status)
        {
            return status;
        }

        if (Context->Options & SHEMU_OPT_TRACE_MEMORY)
        {
            ShemuDisplayMemValue(Context, gla, Value->Size, Value->Value.Bytes, ND_FALSE);
        }

        // If this is a stack access, we need to update the stack pointer.
        if (op->Info.Memory.IsStack)
        {
            ND_UINT64 regval = ShemuX86GetGprValue(Context, NDR_RSP, (2 << Context->Arch.X86.Instruction.DefStack), ND_FALSE);

            regval -= op->Size;

            ShemuX86SetGprValue(Context, NDR_RSP, (2 << Context->Arch.X86.Instruction.DefStack), regval, ND_FALSE);
        }

        // If this is a string operation, make sure we update RSI/RDI.
        if (op->Info.Memory.IsString)
        {
            ND_UINT64 regval = ShemuX86GetGprValue(Context, op->Info.Memory.Base, op->Info.Memory.BaseSize, ND_FALSE);

            regval = GET_FLAG(Context, NDR_RFLAG_DF) ? regval - op->Size : regval + op->Size;

            ShemuX86SetGprValue(Context, op->Info.Memory.Base, op->Info.Memory.BaseSize, regval, ND_FALSE);
        }
    }
    else
    {
        return SHEMU_ABORT_OPERAND_NOT_SUPPORTED;
    }

    return SHEMU_SUCCESS;
}


//
// ShemuX86Multiply64Unsigned
//
static void
ShemuX86Multiply64Unsigned(
    SHEMU_VALUE *Operand1,
    SHEMU_VALUE *Operand2,
    SHEMU_VALUE *Result
    )
{
    ND_UINT64 p0, p1, p2, p3, p4;

    // Multiply the 4 32-bit parts into 4 partial products.
    p0 = (ND_UINT64)Operand1->Value.Dwords[0] * (ND_UINT64)Operand2->Value.Dwords[0];
    p1 = (ND_UINT64)Operand1->Value.Dwords[0] * (ND_UINT64)Operand2->Value.Dwords[1];
    p2 = (ND_UINT64)Operand1->Value.Dwords[1] * (ND_UINT64)Operand2->Value.Dwords[0];
    p3 = (ND_UINT64)Operand1->Value.Dwords[1] * (ND_UINT64)Operand2->Value.Dwords[1];
    p4 = (((p0 >> 32) + (p1 & 0xFFFFFFFF) + (p2 & 0xFFFFFFFF)) >> 32) & 0xFFFFFFFF;

    // Fill in the final result (low & high 64-bit parts).
    Result->Value.Qwords[0] = p0 + (p1 << 32) + (p2 << 32);
    Result->Value.Qwords[1] = p3 + (p1 >> 32) + (p2 >> 32) + p4;
}


//
// ShemuX86Multiply64Signed
//
static void
ShemuX86Multiply64Signed(
    SHEMU_VALUE *Operand1,
    SHEMU_VALUE *Operand2,
    SHEMU_VALUE *Result
    )
{
    ShemuX86Multiply64Unsigned(Operand1, Operand2, Result);

    // Negate, if needed.
    if (ND_GET_SIGN(8, Operand1->Value.Qwords[0]))
    {
        Result->Value.Qwords[1] -= Operand2->Value.Qwords[0];
    }

    // Negate, if needed.
    if (ND_GET_SIGN(8, Operand2->Value.Qwords[0]))
    {
        Result->Value.Qwords[1] -= Operand1->Value.Qwords[0];
    }
}


//
// ShemuX86CheckDiv
//
static ND_BOOL
ShemuX86CheckDiv(
    ND_UINT64 Divident,
    ND_UINT64 Divider,
    ND_UINT8 Size            // The size of the Source (Divider). The Divident is twice as large.
    )
{
    // Returns ND_TRUE if all checks are OK, and Divident / Divider will not cause #DE.

    if (Divider == 0)
    {
        // Division by zero.
        return ND_FALSE;
    }

    // If the result won't fit in the destination, a #DE would be generated.
    switch (Size)
    {
    case 1:
        if (((Divident >> 8) & 0xFF) >= Divider)
        {
            return ND_FALSE;
        }
        break;

    case 2:
        if (((Divident >> 16) & 0xFFFF) >= Divider)
        {
            return ND_FALSE;
        }
        break;

    case 4:
        if (((Divident >> 32) & 0xFFFFFFFF) >= Divider)
        {
            return ND_FALSE;
        }
        break;

    default:
        // 64 bit source division is not supported.
        return ND_FALSE;
    }

    return ND_TRUE;
}


//
// ShemuX86CheckIdiv
//
static ND_BOOL
ShemuX86CheckIdiv(
    ND_SINT64 Divident,
    ND_SINT64 Divider,
    ND_UINT8 Size            // The size of the Source (Divider).
    )
{
    ND_BOOL neg1, neg2;
    ND_UINT64 quotient, max;

    neg1 = Divident < 0;
    neg2 = Divider < 0;

    if (neg1)
    {
        Divident = -Divident;
    }

    if (neg2)
    {
        Divider = -Divider;
    }

    // Do checks when dividing positive values.
    if (!ShemuX86CheckDiv(Divident, Divider, Size))
    {
        return ND_FALSE;
    }

    // Get the positive quotient.
    quotient = (ND_UINT64)Divident / (ND_UINT64)Divider;

    max = (Size == 1) ? 0x80 : (Size == 2) ? 0x8000 : (Size == 4) ? 0x80000000 : 0x8000000000000000;

    if (neg1 ^ neg2)
    {
        // The Divident and the Divider have different signs, the quotient must be negative. If it's positive => #DE.
        if (ND_GET_SIGN(Size, quotient) && quotient != max)
        {
            return ND_FALSE;
        }
    }
    else
    {
        // Both the Divident and the Divider are positive/negative, so a positive result must be produced. If it's 
        // negative => #DE.
        if (ND_GET_SIGN(Size, quotient))
        {
            return ND_FALSE;
        }
    }

    return ND_TRUE;
}


//
// ShemuX86CountZeroBits
//
static ND_UINT8
ShemuX86CountZeroBits(
    ND_UINT64 Value,
    ND_OPERAND_SIZE Size,
    ND_BOOL Forward
    )
{
    ND_UINT8 cnt = 0;

    if (Forward)
    {
        for (ND_UINT32 i = 0; i < Size * 8 && ND_GET_BIT(i, Value) == 0; i++, cnt++);
    }
    else
    {
        for (ND_SINT32 i = Size * 8 - 1; i >= 0 && ND_GET_BIT(i, Value) == 0; i--, cnt++);
    }

    return cnt;
}


//
// ShemuX86PrintContext
//
#ifndef BDDISASM_NO_FORMAT
static void
ShemuX86PrintContext(
    SHEMU_CONTEXT *Context
    )
{
    char text[ND_MIN_BUF_SIZE] = { 0 };
    char ibytes[ND_MAX_INSTRUCTION_LENGTH * 2 + 2] = { 0 };

    NdToText(&Context->Arch.X86.Instruction, Context->Arch.X86.Registers.RegRip, ND_MIN_BUF_SIZE, text);

    shemu_printf(Context, "        RAX = 0x%016llx RCX = 0x%016llx RDX = 0x%016llx RBX = 0x%016llx\n",
        Context->Arch.X86.Registers.RegRax, Context->Arch.X86.Registers.RegRcx, 
        Context->Arch.X86.Registers.RegRdx, Context->Arch.X86.Registers.RegRbx);
    shemu_printf(Context, "        RSP = 0x%016llx RBP = 0x%016llx RSI = 0x%016llx RDI = 0x%016llx\n",
        Context->Arch.X86.Registers.RegRsp, Context->Arch.X86.Registers.RegRbp, 
        Context->Arch.X86.Registers.RegRsi, Context->Arch.X86.Registers.RegRdi);
    shemu_printf(Context, "        R8  = 0x%016llx R9  = 0x%016llx R10 = 0x%016llx R11 = 0x%016llx\n",
        Context->Arch.X86.Registers.RegR8, Context->Arch.X86.Registers.RegR9, 
        Context->Arch.X86.Registers.RegR10, Context->Arch.X86.Registers.RegR11);
    shemu_printf(Context, "        R12 = 0x%016llx R13 = 0x%016llx R14 = 0x%016llx R15 = 0x%016llx\n",
        Context->Arch.X86.Registers.RegR12, Context->Arch.X86.Registers.RegR13, 
        Context->Arch.X86.Registers.RegR14, Context->Arch.X86.Registers.RegR15);

    if (Context->Options & SHEMU_OPT_SUPPORT_APX)
    {
        shemu_printf(Context, "        R16 = 0x%016llx R17 = 0x%016llx R18 = 0x%016llx R19 = 0x%016llx\n",
            Context->Arch.X86.Registers.RegR16, Context->Arch.X86.Registers.RegR17,
            Context->Arch.X86.Registers.RegR18, Context->Arch.X86.Registers.RegR19);
        shemu_printf(Context, "        R20 = 0x%016llx R21 = 0x%016llx R22 = 0x%016llx R23 = 0x%016llx\n",
            Context->Arch.X86.Registers.RegR20, Context->Arch.X86.Registers.RegR21,
            Context->Arch.X86.Registers.RegR22, Context->Arch.X86.Registers.RegR23);
        shemu_printf(Context, "        R24 = 0x%016llx R25 = 0x%016llx R26 = 0x%016llx R27 = 0x%016llx\n",
            Context->Arch.X86.Registers.RegR24, Context->Arch.X86.Registers.RegR25,
            Context->Arch.X86.Registers.RegR26, Context->Arch.X86.Registers.RegR27);
        shemu_printf(Context, "        R28 = 0x%016llx R29 = 0x%016llx R30 = 0x%016llx R31 = 0x%016llx\n",
            Context->Arch.X86.Registers.RegR28, Context->Arch.X86.Registers.RegR29,
            Context->Arch.X86.Registers.RegR30, Context->Arch.X86.Registers.RegR31);
    }

    shemu_printf(Context, "        RIP = 0x%016llx RFLAGS = 0x%016llx ", 
        Context->Arch.X86.Registers.RegRip, Context->Arch.X86.Registers.RegFlags);
    shemu_printf(Context, "  CF:%d PF:%d AF:%d ZF:%d SF:%d TF:%d IF:%d DF:%d OF:%d\n",
        GET_FLAG(Context, NDR_RFLAG_CF),
        GET_FLAG(Context, NDR_RFLAG_PF),
        GET_FLAG(Context, NDR_RFLAG_AF),
        GET_FLAG(Context, NDR_RFLAG_ZF),
        GET_FLAG(Context, NDR_RFLAG_SF),
        GET_FLAG(Context, NDR_RFLAG_TF),
        GET_FLAG(Context, NDR_RFLAG_IF),
        GET_FLAG(Context, NDR_RFLAG_DF),
        GET_FLAG(Context, NDR_RFLAG_OF));

    ShemuHexlify(Context->Arch.X86.Instruction.InstructionBytes, Context->Arch.X86.Instruction.Length, 
        ibytes, sizeof(ibytes));
    
    shemu_printf(Context, "IP: 0x%016llx  %-30s  %s\n", Context->Arch.X86.Registers.RegRip, ibytes, text);
    
}
#else
#define ShemuX86PrintContext(Context)
#endif // !BDDISASM_NO_FORMAT 


//
// ShemuX86Emulate
//
SHEMU_STATUS
ShemuX86Emulate(
    SHEMU_CONTEXT *Context
    )
{
    ND_CONTEXT decodeCtx = { 0 };
    SHEMU_VALUE res, dst, src;
    ND_BOOL stop = ND_FALSE, cf, sled = ND_TRUE, taken = ND_FALSE;
    ND_UINT16 cs = 0;
    ND_UINT64 tsc = 0x1248fe7a5c30;

    if (ND_NULL == Context)
    {
        return SHEMU_ABORT_INVALID_PARAMETER;
    }

    if (ND_NULL == Context->Shellcode)
    {
        return SHEMU_ABORT_INVALID_PARAMETER;
    }

    if (ND_NULL == Context->Stack)
    {
        return SHEMU_ABORT_INVALID_PARAMETER;
    }

    if (ND_NULL == Context->Intbuf)
    {
        return SHEMU_ABORT_INVALID_PARAMETER;
    }

    if (0 == (Context->Options & SHEMU_OPT_DIRECT_MAPPED_SHELL))
    {
        if (Context->StackSize + Context->ShellcodeSize > Context->IntbufSize)
        {
            return SHEMU_ABORT_INVALID_PARAMETER;
        }
    }
    else
    {
        if (Context->StackSize > Context->IntbufSize)
        {
            return SHEMU_ABORT_INVALID_PARAMETER;
        }

        if (Context->AccessShellcode == ND_NULL)
        {
            // The AccessShellcode callback is mandatory when using the SHEMU_OPT_DIRECT_MAPPED_SHELL option.
            return SHEMU_ABORT_INVALID_PARAMETER;
        }

        // It is unsafe to allow self-writes to proceed, so this options is forced when using 
        // SHEMU_OPT_DIRECT_MAPPED_SHELL emulation.
        Context->Options |= SHEMU_OPT_BYPASS_SELF_WRITES;
    }

    if (Context->ArchType != SHEMU_ARCH_TYPE_X86)
    {
        return SHEMU_ABORT_INVALID_PARAMETER;
    }

    if (Context->NopThreshold == 0)
    {
        Context->NopThreshold = SHEMU_DEFAULT_NOP_THRESHOLD;
    }

    if (Context->StrThreshold == 0)
    {
        Context->StrThreshold = SHEMU_DEFAULT_STR_THRESHOLD;
    }

    decodeCtx.DefCode = Context->Arch.X86.Mode;
    decodeCtx.DefData = Context->Arch.X86.Mode;
    decodeCtx.DefStack = Context->Arch.X86.Mode;
    decodeCtx.VendMode = ND_VEND_ANY;
    decodeCtx.Options = 0;
    decodeCtx.FeatMode = 0;

    // Enable APX support.
    if (Context->Options & SHEMU_OPT_SUPPORT_APX)
    {
        decodeCtx.FeatMode |= ND_FEAT_APX;
    }

    while (Context->InstructionsCount++ < Context->MaxInstructionsCount)
    {
        SHEMU_STATUS shstatus;
        NDSTATUS ndstatus;
        ND_UINT64 rip;          // Offset, not actual linear address.
        ND_UINT32 i, ilen;
        ND_UINT8 ibytes[16];
        ND_BOOL hasNd = ND_FALSE;

        tsc++;

        // Reset all the operands to 0.
        nd_memzero(&res, sizeof(res));
        nd_memzero(&dst, sizeof(dst));
        nd_memzero(&src, sizeof(src));

        // The stop flag has been set, this means we've reached a valid instruction, but that instruction cannot be
        // emulated (for example, SYSCALL, INT, system instructions, etc).
        if (stop)
        {
            return SHEMU_ABORT_CANT_EMULATE;
        }

        // If we already have a detection and we wish to stop on detections, do so now.
        if ((0 != Context->Flags) && (0 != (Context->Options & SHEMU_OPT_STOP_ON_EXPLOIT)))
        {
            return SHEMU_ABORT_SHELLCODE_DETECTED;
        }

        // Make sure the RIP is pointing in the right area. We test only 1 byte - the decoder will make sure it can
        // access as many bytes as needed and return error in case it can't.
        if (!ShemuIsShellcodePtr(Context, Context->Arch.X86.Registers.RegRip, 1))
        {
            return SHEMU_ABORT_RIP_OUTSIDE;
        }

        // Get the offset inside the shellcode buffer.
        rip = Context->Arch.X86.Registers.RegRip - Context->ShellcodeBase;

        // Maximum number of bytes we can fetch. No more than 16 bytes.
        ilen = (ND_UINT32)MIN(Context->ShellcodeSize - rip, sizeof(ibytes));

        // Fetch instruction bytes.
        shstatus = ShemuMemFetch(Context, Context->Arch.X86.Registers.RegRip, ilen, ibytes);
        if (SHEMU_SUCCESS != shstatus)
        {
            return shstatus;
        }

        // Decode the next instruction.
        ndstatus = NdDecodeWithContext(&Context->Arch.X86.Instruction, ibytes, ilen, &decodeCtx);
        if (!ND_SUCCESS(ndstatus))
        {
            if (ND_STATUS_BUFFER_TOO_SMALL == ndstatus)
            {
                return SHEMU_ABORT_RIP_OUTSIDE;
            }
            else
            {
                return SHEMU_ABORT_DECODE_ERROR;
            }
        }

        // Paranoid check...
        if (!ShemuIsShellcodePtr(Context, Context->Arch.X86.Registers.RegRip, Context->Arch.X86.Instruction.Length))
        {
            return SHEMU_ABORT_RIP_OUTSIDE;
        }

        // Check if this is a new, unique, address being executed.
        if (!ShemuShellBmpStateCheck(Context, rip, 1, SHELL_BYTE_FETCHED))
        {
            // If SHEMU_OPT_DIRECT_MAPPED_SHELL is not used, this will be incremented for each instruction.
            Context->UniqueCount++;

            // Note: The first instruction byte is marked using SHELL_BYTE_FETCHED, and subsequent bytes using 
            // SHELL_BYTE_IBYTES. For example, if we have the instruction "33C0 (XOR eax, eax)" at address 0x1000,
            // the flags will be set  as follows:
            // Shellcode state for 0x1000: SHELL_BYTE_FETCHED (0x33, the opcode)
            // Shellcode state for 0x1001: SHELL_BYTE_IBYTES  (0xC0, subsequent bytes)
            // Normally, these two flags should never be set together. 

            // Indicate that we've fetched an instruction from this address.
            ShemuShellBmpStateSet(Context, rip, 1, SHELL_BYTE_FETCHED);

            // Indicate that subsequent bytes are part of a fetched & emulated instruction.
            ShemuShellBmpStateSet(Context, rip + 1, Context->Arch.X86.Instruction.Length - 1, SHELL_BYTE_IBYTES);
        }

        // Check if we just fetched an instruction from a previously written area, to raise self-write alert.
        if (ShemuShellBmpStateCheck(Context, rip, Context->Arch.X86.Instruction.Length, SHELL_BYTE_DIRTY))
        {
            Context->Flags |= SHEMU_FLAG_WRITE_SELF;
        }

        // Dump the context.
        if (Context->Options & SHEMU_OPT_TRACE_EMULATION)
        {
            ShemuX86PrintContext(Context);
        }

        // The RIP is incremented BEFORE actually emulating the instruction. This is what the CPU does as well.
        Context->Arch.X86.Registers.RegRip += Context->Arch.X86.Instruction.Length;

        // Bail out early if we encounter a privileged instruction.
        if (Context->Arch.X86.Instruction.ValidModes.Ring3 == 0 && Context->Arch.X86.Ring == 3)
        {
            return SHEMU_ABORT_NO_PRIVILEGE;
        }

        // Some instruction types are unssuported.
        if (Context->Arch.X86.Instruction.Category == ND_CAT_IO ||
            Context->Arch.X86.Instruction.Category == ND_CAT_IOSTRINGOP)
        {
            return SHEMU_ABORT_INSTRUX_NOT_SUPPORTED;
        }

        // There are two important aspects of a NOP sled:
        // 1. The NOPs must be at the beginning of the code;
        // 2. The NOPs must be consecutive;
        // We will only count consecutive NOPs if they're at the beginning of the code.
        if (Context->Arch.X86.Instruction.Instruction != ND_INS_NOP)
        {
            sled = ND_FALSE;
        }

        // Count the total number of '00 00' (ADD [rax], al) instructions.
        if (Context->Arch.X86.Instruction.InstructionBytes[0] == 0 && 
            Context->Arch.X86.Instruction.InstructionBytes[1] == 0)
        {
            Context->NullCount++;
        }

        // FPU instructions are "pass-through", we just want to save the RIP, so we can emulate FNSTENV.
        if ((Context->Arch.X86.Instruction.IsaSet == ND_SET_X87) && 
            (Context->Arch.X86.Instruction.Instruction != ND_INS_FNSTENV))
        {
            PND_OPERAND pMemOp = ND_NULL;

            Context->Arch.X86.Registers.FpuRip = Context->Arch.X86.Registers.RegRip - Context->Arch.X86.Instruction.Length;

            // If the instruction uses a memory operand, validate it, and bail out if it points outside 
            // shellcode or stack memory.
            if (Context->Arch.X86.Instruction.OperandsCount >= 1 && 
                Context->Arch.X86.Instruction.Operands[0].Type == ND_OP_MEM)
            {
                pMemOp = &Context->Arch.X86.Instruction.Operands[0];
            }
            else if (Context->Arch.X86.Instruction.OperandsCount >= 2 && 
                Context->Arch.X86.Instruction.Operands[1].Type == ND_OP_MEM)
            {
                pMemOp = &Context->Arch.X86.Instruction.Operands[1];
            }

            if (ND_NULL != pMemOp)
            {
                ND_UINT64 gla = ShemuX86ComputeLinearAddress(Context, pMemOp);

                if (!ShemuIsShellcodePtr(Context, gla, pMemOp->Size) &&
                    !ShemuIsStackPtr(Context, gla, pMemOp->Size))
                {
                    stop = ND_TRUE;
                }
            }

            continue;
        }

        // This flag can only be set for APX instructions.
        hasNd = !!Context->Arch.X86.Instruction.HasNd;

        switch (Context->Arch.X86.Instruction.Instruction)
        {
        case ND_INS_FNSTENV:
            if (Context->Arch.X86.Instruction.EfOpMode != ND_OPSZ_16)
            {
                src.Size = Context->Arch.X86.Instruction.Operands[0].Size;
                src.Value.FpuEnvironment.FpuInstructionPointer = (ND_UINT32)Context->Arch.X86.Registers.FpuRip;
                SET_OP(Context, 0, &src);
            }
            break;

        case ND_INS_FXSAVE:
        case ND_INS_FXSAVE64:
            src.Size = MIN(Context->Arch.X86.Instruction.Operands[0].Size, sizeof(src.Value.XsaveArea));
            src.Value.XsaveArea.FpuRip = Context->Arch.X86.Registers.FpuRip;
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_MOV_CR:
            if (Context->Arch.X86.Ring != 0)
            {
                return SHEMU_ABORT_NO_PRIVILEGE;
            }

            // Fall through.

        case ND_INS_MOV:
        case ND_INS_MOVZX:
            GET_OP(Context, 1, &src);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_MOVSX:
        case ND_INS_MOVSXD:
            GET_OP(Context, 1, &src);
            GET_OP(Context, 0, &dst);
            dst.Value.Qwords[0] = ND_SIGN_EX(src.Size, src.Value.Qwords[0]);
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_CMOVcc:
            GET_OP(Context, 0, &dst);
            GET_OP(Context, 1, &src);
            if (ShemuX86EvalCondition(Context, Context->Arch.X86.Instruction.Condition))
            {
                SET_OP(Context, 0, &src);
            }
            else
            {
                // Write back the same value that was already present in destination. This has the side-effect of 
                // clearing the upper 32 bit in the 64 bit destination register while in long mode.
                SET_OP(Context, 0, &dst);
            }
            break;

        case ND_INS_SETcc:
            if (ShemuX86EvalCondition(Context, Context->Arch.X86.Instruction.Condition))
            {
                src.Size = Context->Arch.X86.Instruction.Operands[0].Size;
                src.Value.Qwords[0] = 1;
            }
            else
            {
                src.Size = Context->Arch.X86.Instruction.Operands[0].Size;
                src.Value.Qwords[0] = 0;
            }
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_XLATB:
            GET_OP(Context, 1, &src);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_XCHG:
            GET_OP(Context, 1, &src);
            GET_OP(Context, 0, &dst);
            SET_OP(Context, 1, &dst);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_XADD:
            GET_OP(Context, 1, &src);
            GET_OP(Context, 0, &dst);
            res.Size = dst.Size;
            res.Value.Qwords[0] = dst.Value.Qwords[0] + src.Value.Qwords[0];
            SET_FLAGS(Context, res, dst, src, FM_ADD);
            SET_OP(Context, 1, &dst);
            SET_OP(Context, 0, &res);
            break;

        case ND_INS_CMPXCHG:
            GET_OP(Context, 2, &src);
            GET_OP(Context, 0, &dst);

            res.Size = src.Size;
            // Note: The accumulator is compared with the destination, not the other way around.
            res.Value.Qwords[0] = src.Value.Qwords[0] - dst.Value.Qwords[0];

            SET_FLAGS(Context, res, src, dst, FM_SUB);

            if (src.Value.Qwords[0] == dst.Value.Qwords[0])
            {
                GET_OP(Context, 1, &src);
                SET_OP(Context, 0, &src);
            }
            else
            {
                SET_OP(Context, 2, &dst);
            }
            break;

        case ND_INS_ADD:
        case ND_INS_ADC:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            GET_OP(Context, hasNd ? 2 : 1, &src);

            res.Size = src.Size;
            res.Value.Qwords[0] = dst.Value.Qwords[0] + src.Value.Qwords[0];

            if (ND_INS_ADC == Context->Arch.X86.Instruction.Instruction)
            {
                res.Value.Qwords[0] += GET_FLAG(Context, NDR_RFLAG_CF);
            }

            SET_FLAGS(Context, res, dst, src, FM_ADD);
            SET_OP(Context, 0, &res);

            break;

        case ND_INS_SUB:
        case ND_INS_SBB:
        case ND_INS_CMP:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            GET_OP(Context, hasNd ? 2 : 1, &src);

            res.Size = src.Size;
            res.Value.Qwords[0] = dst.Value.Qwords[0] - src.Value.Qwords[0];

            if (ND_INS_SBB == Context->Arch.X86.Instruction.Instruction)
            {
                res.Value.Qwords[0] -= GET_FLAG(Context, NDR_RFLAG_CF);
            }

            SET_FLAGS(Context, res, dst, src, FM_SUB);

            if (ND_INS_CMP != Context->Arch.X86.Instruction.Instruction)
            {
                SET_OP(Context, 0, &res);
            }

            break;

        case ND_INS_INC:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            src.Size = dst.Size;
            src.Value.Qwords[0] = 1;
            res.Size = src.Size;
            res.Value.Qwords[0] = dst.Value.Qwords[0] + src.Value.Qwords[0];
            cf = GET_FLAG(Context, NDR_RFLAG_CF);
            SET_FLAGS(Context, res, dst, src, FM_ADD);
            SET_FLAG(Context, NDR_RFLAG_CF, cf);
            SET_OP(Context, 0, &res);
            break;

        case ND_INS_DEC:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            src.Size = dst.Size;
            src.Value.Qwords[0] = 1;
            res.Size = src.Size;
            res.Value.Qwords[0] = dst.Value.Qwords[0] - src.Value.Qwords[0];
            cf = GET_FLAG(Context, NDR_RFLAG_CF);
            SET_FLAGS(Context, res, dst, src, FM_SUB);
            SET_FLAG(Context, NDR_RFLAG_CF, cf);
            SET_OP(Context, 0, &res);
            break;

        case ND_INS_PUSH:
        case ND_INS_PUSHP:
        case ND_INS_PUSHF:
            GET_OP(Context, 0, &src);
            SET_OP(Context, 1, &src);
            break;

        case ND_INS_POP:
        case ND_INS_POPP:
        case ND_INS_POPF:
            GET_OP(Context, 1, &src);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_PUSH2:
        case ND_INS_PUSH2P:
            // APX extended EVEX can only be enabled in 64-bit mode, so it's safe to always assume 64-bit regs.
            src.Size = 16;
            src.Value.Qwords[0] = ShemuX86GetGprValue(Context, Context->Arch.X86.Instruction.Operands[1].Info.Register.Reg, 8, ND_FALSE);
            src.Value.Qwords[1] = ShemuX86GetGprValue(Context, Context->Arch.X86.Instruction.Operands[0].Info.Register.Reg, 8, ND_FALSE);
            SET_OP(Context, 2, &src);
            break;

        case ND_INS_POP2:
        case ND_INS_POP2P:
            // I know this looks weird, and you would expect a PUSH2 r1, r2 to have a complementary POP2 r1, r2, but
            // according to the current APX specification, this is not the case. The right pair for PUSH2 r1, r2 is, 
            // in fact, POP2 r2, r1 (the order of the popped operands must be reversed to match the PUSH2).
            GET_OP(Context, 2, &src);
            ShemuX86SetGprValue(Context, Context->Arch.X86.Instruction.Operands[0].Info.Register.Reg, 8, src.Value.Qwords[0], ND_FALSE);
            ShemuX86SetGprValue(Context, Context->Arch.X86.Instruction.Operands[1].Info.Register.Reg, 8, src.Value.Qwords[1], ND_FALSE);
            break;

        case ND_INS_PUSHA:
            src.Size = 16;
            src.Value.Words[7] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_AX, 2, ND_FALSE);
            src.Value.Words[6] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_CX, 2, ND_FALSE);
            src.Value.Words[5] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_DX, 2, ND_FALSE);
            src.Value.Words[4] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_BX, 2, ND_FALSE);
            src.Value.Words[3] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_SP, 2, ND_FALSE);
            src.Value.Words[2] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_BP, 2, ND_FALSE);
            src.Value.Words[1] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_SI, 2, ND_FALSE);
            src.Value.Words[0] = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_DI, 2, ND_FALSE);
            SET_OP(Context, 1, &src); // Operand 1 is the stack (destination).
            break;

        case ND_INS_PUSHAD:
            src.Size = 32;
            src.Value.Dwords[7] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_EAX, 4, ND_FALSE);
            src.Value.Dwords[6] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_ECX, 4, ND_FALSE);
            src.Value.Dwords[5] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_EDX, 4, ND_FALSE);
            src.Value.Dwords[4] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_EBX, 4, ND_FALSE);
            src.Value.Dwords[3] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_ESP, 4, ND_FALSE);
            src.Value.Dwords[2] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_EBP, 4, ND_FALSE);
            src.Value.Dwords[1] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_ESI, 4, ND_FALSE);
            src.Value.Dwords[0] = (ND_UINT32)ShemuX86GetGprValue(Context, NDR_EDI, 4, ND_FALSE);
            SET_OP(Context, 1, &src); // Operand 1 is the stack (destination).
            break;

        case ND_INS_POPA:
            GET_OP(Context, 1, &src); // Operand 1 is the stack (source).
            ShemuX86SetGprValue(Context, NDR_AX, 2, src.Value.Words[7], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_CX, 2, src.Value.Words[6], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_DX, 2, src.Value.Words[5], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_BX, 2, src.Value.Words[4], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_BP, 2, src.Value.Words[2], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_SI, 2, src.Value.Words[1], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_DI, 2, src.Value.Words[0], ND_FALSE);
            break;

        case ND_INS_POPAD:
            GET_OP(Context, 1, &src); // Operand 1 is the stack (source).
            ShemuX86SetGprValue(Context, NDR_EAX, 4, src.Value.Dwords[7], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_ECX, 4, src.Value.Dwords[6], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_EDX, 4, src.Value.Dwords[5], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_EBX, 4, src.Value.Dwords[4], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_EBP, 4, src.Value.Dwords[2], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_ESI, 4, src.Value.Dwords[1], ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_EDI, 4, src.Value.Dwords[0], ND_FALSE);
            break;

        case ND_INS_LEA:
            GET_OP(Context, 1, &src);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_SHL:
        case ND_INS_SAL:
        case ND_INS_SHR:
        case ND_INS_SAR:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            GET_OP(Context, hasNd ? 2 : 1, &src);

            if (dst.Size == 8)
            {
                src.Value.Qwords[0] &= 0x3f;
            }
            else
            {
                src.Value.Qwords[0] &= 0x1f;
            }

            res.Size = dst.Size;

            if (ND_INS_SHL == Context->Arch.X86.Instruction.Instruction || 
                ND_INS_SAL == Context->Arch.X86.Instruction.Instruction)
            {
                res.Value.Qwords[0] = dst.Value.Qwords[0] << src.Value.Qwords[0];
            }
            else if (ND_INS_SHR == Context->Arch.X86.Instruction.Instruction)
            {
                res.Value.Qwords[0] = dst.Value.Qwords[0] >> src.Value.Qwords[0];
            }
            else
            {
                ND_SINT64 val = (ND_SINT64)ND_SIGN_EX(dst.Size, dst.Value.Qwords[0]);
                val = val >> src.Value.Qwords[0];
                res.Value.Qwords[0] = (ND_UINT64)val;
            }

            if (src.Value.Qwords[0] != 0)
            {
                // 0 bit shifts do not affect the flags.
                if (ND_INS_SHL == Context->Arch.X86.Instruction.Instruction || 
                    ND_INS_SAL == Context->Arch.X86.Instruction.Instruction)
                {
                    SET_FLAGS(Context, res, dst, src, FM_SHL);
                }
                else if (ND_INS_SHR == Context->Arch.X86.Instruction.Instruction)
                {
                    SET_FLAGS(Context, res, dst, src, FM_SHR);
                }
                else
                {
                    SET_FLAGS(Context, res, dst, src, FM_SAR);
                }
            }

            SET_OP(Context, 0, &res);
            break;

        case ND_INS_RCL:
        case ND_INS_RCR:
        case ND_INS_ROL:
        case ND_INS_ROR:
            {
                ND_UINT32 cnt, tempcnt, cntmask, bitwidth;
                ND_UINT8 tempCF = 0;

                GET_OP(Context, hasNd ? 1 : 0, &dst);
                GET_OP(Context, hasNd ? 2 : 1, &src);

                cnt = (ND_UINT32)src.Value.Qwords[0];
                cntmask = ((dst.Size == 8) ? 0x3F : 0x1F);
                tempcnt = (cnt & cntmask);
                bitwidth = (ND_UINT32)dst.Size * 8;

                if (ND_INS_RCL == Context->Arch.X86.Instruction.Instruction || 
                    ND_INS_RCR == Context->Arch.X86.Instruction.Instruction)
                {
                    if (dst.Size == 1)
                    {
                        tempcnt %= 9;
                    }
                    else if (dst.Size == 2)
                    {
                        tempcnt %= 17;
                    }
                }
                else
                {
                    tempcnt %= (dst.Size * 8);
                }

                if (ND_INS_RCL == Context->Arch.X86.Instruction.Instruction)
                {
                    tempCF = GET_FLAG(Context, NDR_RFLAG_CF);

                    if (tempcnt != 0)
                    {
                        // tempcnt is in range [1, dst bit width].
                        ND_UINT64 left = (tempcnt == bitwidth) ? 0 : (dst.Value.Qwords[0] << tempcnt);
                        ND_UINT64 right = (tempcnt == 1) ? 0 : (dst.Value.Qwords[0] >> (bitwidth - tempcnt + 1));

                        SET_FLAG(Context, NDR_RFLAG_CF, ND_GET_BIT(bitwidth - tempcnt, dst.Value.Qwords[0]));

                        dst.Value.Qwords[0] = left | ((ND_UINT64)tempCF << (tempcnt - 1)) | right;
                    }

                    if ((cnt & cntmask) == 1)
                    {
                        SET_FLAG(Context, NDR_RFLAG_OF, ND_MSB(dst.Size, dst.Value.Qwords[0]) ^
                                                        GET_FLAG(Context, NDR_RFLAG_CF));
                    }
                }
                else if (ND_INS_RCR == Context->Arch.X86.Instruction.Instruction)
                {
                    tempCF = GET_FLAG(Context, NDR_RFLAG_CF);

                    if ((cnt & cntmask) == 1)
                    {
                        SET_FLAG(Context, NDR_RFLAG_OF, ND_MSB(dst.Size, dst.Value.Qwords[0]) ^
                                                        GET_FLAG(Context, NDR_RFLAG_CF));
                    }

                    if (tempcnt != 0)
                    {
                        // tempcnt is in range [1, dst bit width].
                        ND_UINT64 left = (tempcnt == bitwidth) ? 0 : (dst.Value.Qwords[0] >> tempcnt);
                        ND_UINT64 right = (tempcnt == 1) ? 0 : (dst.Value.Qwords[0] << (bitwidth - tempcnt + 1));

                        SET_FLAG(Context, NDR_RFLAG_CF, ND_GET_BIT(tempcnt - 1, dst.Value.Qwords[0]));

                        dst.Value.Qwords[0] = left | ((ND_UINT64)tempCF << (bitwidth - tempcnt)) | right;
                    }
                }
                else if (ND_INS_ROL == Context->Arch.X86.Instruction.Instruction)
                {
                    if (tempcnt != 0)
                    {
                        // tempcnt is in range [1, dst bit width - 1].
                        ND_UINT64 left = dst.Value.Qwords[0] << tempcnt;
                        ND_UINT64 right = dst.Value.Qwords[0] >> (bitwidth - tempcnt);

                        dst.Value.Qwords[0] = left | right;
                    }

                    if ((cnt & cntmask) != 0)
                    {
                        SET_FLAG(Context, NDR_RFLAG_CF, ND_LSB(dst.Size, dst.Value.Qwords[0]));
                    }

                    if ((cnt & cntmask) == 1)
                    {
                        SET_FLAG(Context, NDR_RFLAG_OF, ND_MSB(dst.Size, dst.Value.Qwords[0]) ^
                                                        GET_FLAG(Context, NDR_RFLAG_CF));
                    }
                }
                else // ND_INS_ROR
                {
                    if (tempcnt != 0)
                    {
                        // tempcnt is in range [1, dst bit width - 1].
                        ND_UINT64 left = (dst.Value.Qwords[0] >> tempcnt);
                        ND_UINT64 right = (dst.Value.Qwords[0] << (bitwidth - tempcnt));

                        dst.Value.Qwords[0] = left | right;
                    }

                    if ((cnt & cntmask) != 0)
                    {
                        SET_FLAG(Context, NDR_RFLAG_CF, ND_MSB(dst.Size, dst.Value.Qwords[0]));
                    }

                    if ((cnt & cntmask) == 1)
                    {
                        SET_FLAG(Context, NDR_RFLAG_OF, ND_MSB(dst.Size, dst.Value.Qwords[0]) ^ 
                                                        ND_GET_BIT(dst.Size * 8ULL - 2, dst.Value.Qwords[0]));
                    }
                }

                SET_OP(Context, 0, &dst);
            }
            break;

        case ND_INS_OR:
        case ND_INS_XOR:
        case ND_INS_AND:
        case ND_INS_TEST:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            GET_OP(Context, hasNd ? 2 : 1, &src);

            res.Size = dst.Size;

            if (ND_INS_OR == Context->Arch.X86.Instruction.Instruction)
            {
                res.Value.Qwords[0] = dst.Value.Qwords[0] | src.Value.Qwords[0];
            }
            else if (ND_INS_XOR == Context->Arch.X86.Instruction.Instruction)
            {
                res.Value.Qwords[0] = dst.Value.Qwords[0] ^ src.Value.Qwords[0];
            }
            else
            {
                res.Value.Qwords[0] = dst.Value.Qwords[0] & src.Value.Qwords[0];
            }

            if (ND_INS_TEST != Context->Arch.X86.Instruction.Instruction)
            {
                SET_OP(Context, 0, &res);
            }

            SET_FLAGS(Context, res, dst, src, FM_LOGIC);
            break;

        case ND_INS_NOT:
            GET_OP(Context, hasNd ? 1 : 0, &dst);
            dst.Value.Qwords[0] = ~dst.Value.Qwords[0];
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_NEG:
            GET_OP(Context, hasNd ? 1 : 0, &src);
            dst.Size = src.Size;
            dst.Value.Qwords[0] = 0;
            res.Size = src.Size;
            res.Value.Qwords[0] = dst.Value.Qwords[0] - src.Value.Qwords[0];
            SET_OP(Context, 0, &res);
            SET_FLAGS(Context, res, dst, src, FM_SUB);
            SET_FLAG(Context, NDR_RFLAG_CF, src.Value.Qwords[0] != 0);
            break;

        case ND_INS_BT:
        case ND_INS_BTS:
        case ND_INS_BTR:
        case ND_INS_BTC:
            GET_OP(Context, 0, &dst);
            GET_OP(Context, 1, &src);

            src.Value.Qwords[0] %= dst.Size * 8ULL;

            // Store the bit inside CF.
            SET_FLAG(Context, NDR_RFLAG_CF, (dst.Value.Qwords[0] >> src.Value.Qwords[0]) & 1);

            if (ND_INS_BTS == Context->Arch.X86.Instruction.Instruction)
            {
                dst.Value.Qwords[0] |= (1ULL << src.Value.Qwords[0]);
            }
            else if (ND_INS_BTR == Context->Arch.X86.Instruction.Instruction)
            {
                dst.Value.Qwords[0] &= ~(1ULL << src.Value.Qwords[0]);
            }
            else if (ND_INS_BTC == Context->Arch.X86.Instruction.Instruction)
            {
                dst.Value.Qwords[0] ^= (1ULL << src.Value.Qwords[0]);
            }

            if (ND_INS_BT != Context->Arch.X86.Instruction.Instruction)
            {
                SET_OP(Context, 0, &dst);
            }

            break;

        case ND_INS_Jcc:
            // Initial condition evaluation.
            taken = ShemuX86EvalCondition(Context, Context->Arch.X86.Instruction.Condition);

            // Track the loop. This may override the taken/not taken state.
            taken = ShemuX86TrackLoop(Context, taken);

            if (taken)
            {
                // Modify the RIP if the branch is taken.
                GET_OP(Context, 1, &res);
                res.Value.Qwords[0] += Context->Arch.X86.Instruction.Operands[0].Info.RelativeOffset.Rel;
                SET_OP(Context, 1, &res);
            }

            break;

        case ND_INS_JrCXZ:
            // Fetch the rCX value. It could be CX, ECX or RCX, depending on address size.
            GET_OP(Context, 1, &dst);

            // Initial condition evaluation.
            taken = dst.Value.Qwords[0] == 0;

            // Track the loop. This may override the taken/not taken state.
            taken = ShemuX86TrackLoop(Context, taken);

            if (taken)
            {
                // Modify the RIP if the branch is taken.
                GET_OP(Context, 2, &res);
                res.Value.Qwords[0] += Context->Arch.X86.Instruction.Operands[0].Info.RelativeOffset.Rel;
                SET_OP(Context, 2, &res);
            }

            break;

        case ND_INS_LOOP:
        case ND_INS_LOOPNZ:
        case ND_INS_LOOPZ:
            // rCX is decremented first. Note that the size depends on address size.
            GET_OP(Context, 1, &dst);
            dst.Value.Qwords[0]--;
            SET_OP(Context, 1, &dst);

            // Initial condition evaluation.
            taken = dst.Value.Qwords[0] > 0 && 
                    (((ND_INS_LOOPNZ == Context->Arch.X86.Instruction.Instruction) && (0 == GET_FLAG(Context, NDR_RFLAG_ZF))) ||
                     ((ND_INS_LOOPZ  == Context->Arch.X86.Instruction.Instruction) && (0 != GET_FLAG(Context, NDR_RFLAG_ZF))) ||
                      (ND_INS_LOOP   == Context->Arch.X86.Instruction.Instruction));

            // Track the loop. This may override the taken/not taken state.
            taken = ShemuX86TrackLoop(Context, taken);

            if (taken)
            {
                // Modify the RIP if the branch is taken.
                GET_OP(Context, 2, &res);
                res.Value.Qwords[0] += Context->Arch.X86.Instruction.Operands[0].Info.RelativeOffset.Rel;
                SET_OP(Context, 2, &res);
            }

            break;

        case ND_INS_JMPNR:
            // Track loops.
            ShemuX86TrackLoop(Context, ND_TRUE);

            GET_OP(Context, 1, &res);
            res.Value.Qwords[0] += Context->Arch.X86.Instruction.Operands[0].Info.RelativeOffset.Rel;
            SET_OP(Context, 1, &res);
            break;

        case ND_INS_JMPNI:
            GET_OP(Context, 0, &src);
            SET_OP(Context, 1, &src);   // Set the RIP to the new value.
            break;

        case ND_INS_CALLNR:
            // Save the EIP on the stack.
            GET_OP(Context, 1, &res);
            SET_OP(Context, 2, &res);
            res.Value.Qwords[0] += Context->Arch.X86.Instruction.Operands[0].Info.RelativeOffset.Rel;
            SET_OP(Context, 1, &res);
            break;

        case ND_INS_CALLNI:
            GET_OP(Context, 0, &src);
            GET_OP(Context, 1, &dst);   // The RIP
            SET_OP(Context, 2, &dst);   // Save the RIP on the stack.
            SET_OP(Context, 1, &src);   // Set the RIP to the new value.
            break;

        case ND_INS_RETN:
            if (!Context->Arch.X86.Instruction.HasImm1)
            {
                // The simple RET form, 0xC3
                GET_OP(Context, 1, &src);
                SET_OP(Context, 0, &src);
            }
            else
            {
                // The RET Imm16 form, 0xC2
                GET_OP(Context, 3, &src);
                SET_OP(Context, 1, &src);

                // Patch the RSP register.
                GET_OP(Context, 2, &res);
                res.Value.Qwords[0] += Context->Arch.X86.Instruction.Operands[0].Info.Immediate.Imm;
                SET_OP(Context, 2, &res);
            }
            break;

        case ND_INS_JMPFD:
        case ND_INS_CALLFD:
            cs = (ND_UINT16)Context->Arch.X86.Instruction.Operands[0].Info.Address.BaseSeg;
            goto check_far_branch;

        case ND_INS_JMPFI:
        case ND_INS_CALLFI:
        case ND_INS_IRET:
        case ND_INS_RETF:
            if (Context->Arch.X86.Instruction.Instruction == ND_INS_RETF)
            {
                if (Context->Arch.X86.Instruction.Operands[0].Type == ND_OP_IMM)
                {
                    // RETF imm
                    GET_OP(Context, 3, &src);
                }
                else
                {
                    // RETF
                    GET_OP(Context, 2, &src);
                }
            }
            else if (Context->Arch.X86.Instruction.Instruction == ND_INS_IRET)
            {
                // IRET
                 GET_OP(Context, 2, &src);
            }
            else
            {
                // JMP/CALL far
                GET_OP(Context, 0, &src);
            }

            // The destination code segment is the second WORD/DWORD/QWORD.
            switch (Context->Arch.X86.Instruction.WordLength)
            {
            case 2:
                cs = (ND_UINT16)src.Value.Words[1];
                break;
            case 4:
                cs = (ND_UINT16)src.Value.Dwords[1];
                break;
            case 8:
                cs = (ND_UINT16)src.Value.Qwords[1];
                break;
            default:
                cs = 0;
                break;
            }

check_far_branch:
            if (Context->Arch.X86.Mode == ND_CODE_32 && cs == 0x33)
            {
                Context->Flags |= SHEMU_FLAG_HEAVENS_GATE;
            }

            // We may, in the future, emulate far branches, but they imply some tricky context switches (including
            // the default TEB), so it may not be as straight forward as it seems. For now, all we wish to achieve 
            // is detection of far branches in long-mode, from Wow 64.
            stop = ND_TRUE;
            break;

        case ND_INS_LODS:
        case ND_INS_STOS:
        case ND_INS_MOVS:
            // Fetch the rCX register, which is the third operand in case of repeated instructions.
            while (Context->InstructionsCount < Context->MaxInstructionsCount)
            {
                // Get the RCX value.
                GET_OP(Context, 2, &dst);

                if (Context->Arch.X86.Instruction.IsRepeated && (dst.Value.Qwords[0] == 0))
                {
                    break;
                }

                // Load the source into the destination.
                GET_OP(Context, 1, &src);
                SET_OP(Context, 0, &src);

                if (Context->Arch.X86.Instruction.IsRepeated)
                {
                    // Decrement RCX.
                    dst.Value.Qwords[0]--;
                    SET_OP(Context, 2, &dst);
                }
                else
                {
                    break;
                }

                Context->InstructionsCount++;
            }
            break;

        case ND_INS_SCAS:
        case ND_INS_CMPS:
            while (Context->InstructionsCount < Context->MaxInstructionsCount)
            {
                // Get the RCX value.
                GET_OP(Context, 2, &dst);

                if (Context->Arch.X86.Instruction.IsRepeated && (dst.Value.Qwords[0] == 0))
                {
                    break;
                }

                // Move on with the source & destination.
                nd_memzero(&dst, sizeof(dst));

                GET_OP(Context, 0, &dst);
                GET_OP(Context, 1, &src);

                res.Size = dst.Size;
                res.Value.Qwords[0] = dst.Value.Qwords[0] - src.Value.Qwords[0];

                SET_FLAGS(Context, res, dst, src, FM_SUB);

                if (Context->Arch.X86.Instruction.IsRepeated)
                {
                    // Read RCX again & decrement it.
                    nd_memzero(&dst, sizeof(dst));
                    GET_OP(Context, 2, &dst);
                    dst.Value.Qwords[0]--;
                    SET_OP(Context, 2, &dst);

                    if (Context->Arch.X86.Instruction.HasRepRepzXrelease && !GET_FLAG(Context, NDR_RFLAG_ZF))
                    {
                        break;
                    }

                    if (Context->Arch.X86.Instruction.HasRepnzXacquireBnd && GET_FLAG(Context, NDR_RFLAG_ZF))
                    {
                        break;
                    }
                }
                else
                {
                    break;
                }

                Context->InstructionsCount++;
            }
            break;

        case ND_INS_MUL:
        case ND_INS_IMUL:
            if (Context->Arch.X86.Instruction.ExpOperandsCount == 1)
            {
                // MUL or IMUL with a single explicit operand.
                GET_OP(Context, 0, &dst);
                GET_OP(Context, 1, &src);
                res.Size = dst.Size * 2;
            }
            else if (Context->Arch.X86.Instruction.ExpOperandsCount == 2)
            {
                // IMUL with 2 explicit operands.
                GET_OP(Context, 0, &dst);
                GET_OP(Context, 1, &src);
                res.Size = dst.Size;
            }
            else
            {
                // IMUL with 3 operands. The first operand is the write-only destination.
                // This also covers the {ND} form, which is equivalent to the 3 operand form.
                GET_OP(Context, 1, &dst);
                GET_OP(Context, 2, &src);
                res.Size = dst.Size;
            }


            if (dst.Size == 1)
            {
                if (ND_INS_MUL == Context->Arch.X86.Instruction.Instruction)
                {
                    res.Value.Words[0] = dst.Value.Bytes[0] * src.Value.Bytes[0];
                }
                else
                {
                    res.Value.Words[0] = (ND_UINT16)((ND_SINT8)dst.Value.Bytes[0] * (ND_SINT8)src.Value.Bytes[0]);
                }
            }
            else if (dst.Size == 2)
            {
                if (ND_INS_MUL == Context->Arch.X86.Instruction.Instruction)
                {
                    res.Value.Dwords[0] = dst.Value.Words[0] * src.Value.Words[0];
                }
                else
                {
                    res.Value.Dwords[0] = (ND_UINT32)((ND_SINT16)dst.Value.Words[0] * (ND_SINT16)src.Value.Words[0]);
                }
            }
            else if (dst.Size == 4)
            {
                if (ND_INS_MUL == Context->Arch.X86.Instruction.Instruction)
                {
                    res.Value.Qwords[0] = dst.Value.Qwords[0] * src.Value.Qwords[0];
                }
                else
                {
                    res.Value.Qwords[0] = (ND_UINT64)((ND_SINT64)(ND_SINT32)dst.Value.Dwords[0] * (ND_SINT64)(ND_SINT32)src.Value.Dwords[0]);
                }
            }
            else
            {
                if (ND_INS_MUL == Context->Arch.X86.Instruction.Instruction)
                {
                    ShemuX86Multiply64Unsigned(&dst, &src, &res);
                }
                else
                {
                    ShemuX86Multiply64Signed(&dst, &src, &res);
                }
            }

            if (Context->Arch.X86.Instruction.ExpOperandsCount == 1)
            {
                // The result is stored in AX, DX:AX, EDX:EAX or RDX:RAX for the single-operand form.
                switch (dst.Size)
                {
                case 1:
                    ShemuX86SetGprValue(Context, NDR_AX, 2, res.Value.Words[0], ND_FALSE);
                    break;
                case 2:
                    ShemuX86SetGprValue(Context, NDR_DX, 2, res.Value.Words[1], ND_FALSE);
                    ShemuX86SetGprValue(Context, NDR_AX, 2, res.Value.Words[0], ND_FALSE);
                    break;
                case 4:
                    ShemuX86SetGprValue(Context, NDR_EDX, 4, res.Value.Dwords[1], ND_FALSE);
                    ShemuX86SetGprValue(Context, NDR_EAX, 4, res.Value.Dwords[0], ND_FALSE);
                    break;
                case 8:
                    ShemuX86SetGprValue(Context, NDR_RDX, 8, res.Value.Qwords[1], ND_FALSE);
                    ShemuX86SetGprValue(Context, NDR_RAX, 8, res.Value.Qwords[0], ND_FALSE);
                    break;
                }
            }
            else
            {
                // The result is truncated and stored in the destination operand for the 2 & 3 operands forms.
                SET_OP(Context, 0, &res);
            }

            // Set the flags.
            if (ND_INS_MUL == Context->Arch.X86.Instruction.Instruction)
            {
                ND_UINT8 cfof = 0;

                // CF and OF are set to 0 if the high part of the result is 0, otherwise they are set to 1.
                switch (dst.Size)
                {
                case 1:
                    cfof = (0 == res.Value.Bytes[1]) ? 0 : 1;
                    break;
                case 2:
                    cfof = (0 == res.Value.Words[1]) ? 0 : 1;
                    break;
                case 4:
                    cfof = (0 == res.Value.Dwords[1]) ? 0 : 1;
                    break;
                case 8:
                    cfof = (0 == res.Value.Qwords[1]) ? 0 : 1;
                    break;
                }

                SET_FLAG(Context, NDR_RFLAG_CF, cfof);
                SET_FLAG(Context, NDR_RFLAG_OF, cfof);
            }
            else
            {
                // The CF and OF flags are set when the signed integer value of the intermediate product differs from
                // the sign extended operand - size - truncated product, otherwise the CF and OF flags are cleared.
                ND_UINT8 cfof = 0, sign = 0;

                sign = ND_MSB(dst.Size, res.Value.Qwords[0]);

                switch (dst.Size)
                {
                case 1:
                    cfof = (0 == res.Value.Bytes[1] && 0 == sign) || 
                           ((ND_UINT8)-1 == res.Value.Bytes[1] && 1 == sign) ? 0 : 1;
                    break;
                case 2:
                    cfof = (0 == res.Value.Words[1] && 0 == sign) || 
                           ((ND_UINT16)-1 == res.Value.Words[1] && 1 == sign) ? 0 : 1;
                    break;
                case 4:
                    cfof = (0 == res.Value.Dwords[1] && 0 == sign) || 
                           ((ND_UINT32)-1 == res.Value.Dwords[1] && 1 == sign) ? 0 : 1;
                    break;
                case 8:
                    cfof = (0 == res.Value.Qwords[1] && 0 == sign) || 
                           ((ND_UINT64)-1 == res.Value.Qwords[1] && 1 == sign) ? 0 : 1;
                    break;
                }

                SET_FLAG(Context, NDR_RFLAG_CF, cfof);
                SET_FLAG(Context, NDR_RFLAG_OF, cfof);
            }

            break;

        case ND_INS_DIV:
        case ND_INS_IDIV:
            // DIV and IDIV only exist with a single explicit operand encoding. All flags are undefined.
            // No {ND} form for DIV/IDIV.
            GET_OP(Context, 0, &src);

            if (src.Size == 1)
            {
                ND_UINT16 divident;

                divident = (ND_UINT16)ShemuX86GetGprValue(Context, NDR_AX, 2, ND_FALSE);

                if (ND_INS_DIV == Context->Arch.X86.Instruction.Instruction)
                {
                    if (!ShemuX86CheckDiv(divident, src.Value.Bytes[0], 1))
                    {
                        stop = ND_TRUE;
                        break;
                    }

                    res.Value.Bytes[0] = (ND_UINT8)(divident / src.Value.Bytes[0]);
                    res.Value.Bytes[1] = (ND_UINT8)(divident % src.Value.Bytes[0]);
                }
                else
                {
                    if (!ShemuX86CheckIdiv((ND_SINT64)(ND_SINT16)divident, (ND_SINT64)(ND_SINT8)src.Value.Bytes[0], 1))
                    {
                        stop = ND_TRUE;
                        break;
                    }

                    res.Value.Bytes[0] = (ND_UINT8)(ND_SINT8)((ND_SINT16)divident / (ND_SINT8)src.Value.Bytes[0]);
                    res.Value.Bytes[1] = (ND_UINT8)(ND_SINT8)((ND_SINT16)divident % (ND_SINT8)src.Value.Bytes[0]);
                }

                // Result in AX (AL - quotient, AH - reminder).
                ShemuX86SetGprValue(Context, NDR_AX, 2, res.Value.Words[0], ND_FALSE);
            }
            else if (src.Size == 2)
            {
                ND_UINT32 divident;

                divident = ((ND_UINT32)ShemuX86GetGprValue(Context, NDR_EDX, 2, ND_FALSE) << 16) |
                            (ND_UINT32)ShemuX86GetGprValue(Context, NDR_EAX, 2, ND_FALSE);

                if (ND_INS_DIV == Context->Arch.X86.Instruction.Instruction)
                {
                    if (!ShemuX86CheckDiv(divident, src.Value.Words[0], 2))
                    {
                        stop = ND_TRUE;
                        break;
                    }

                    res.Value.Words[0] = (ND_UINT16)(divident / src.Value.Words[0]);
                    res.Value.Words[1] = (ND_UINT16)(divident % src.Value.Words[0]);
                }
                else
                {
                    if (!ShemuX86CheckIdiv((ND_SINT64)(ND_SINT32)divident, (ND_SINT64)(ND_SINT16)src.Value.Words[0], 2))
                    {
                        stop = ND_TRUE;
                        break;
                    }

                    res.Value.Words[0] = (ND_UINT16)(ND_SINT16)((ND_SINT32)divident / (ND_SINT16)src.Value.Words[0]);
                    res.Value.Words[1] = (ND_UINT16)(ND_SINT16)((ND_SINT32)divident % (ND_SINT16)src.Value.Words[0]);
                }

                ShemuX86SetGprValue(Context, NDR_DX, 2, res.Value.Words[1], ND_FALSE);
                ShemuX86SetGprValue(Context, NDR_AX, 2, res.Value.Words[0], ND_FALSE);
            }
            else if (src.Size == 4)
            {
                ND_UINT64 divident;

                divident = ((ND_UINT64)ShemuX86GetGprValue(Context, NDR_EDX, 4, ND_FALSE) << 32) |
                            (ND_UINT64)ShemuX86GetGprValue(Context, NDR_EAX, 4, ND_FALSE);

                if (ND_INS_DIV == Context->Arch.X86.Instruction.Instruction)
                {
                    if (!ShemuX86CheckDiv(divident, src.Value.Dwords[0], 4))
                    {
                        stop = ND_TRUE;
                        break;
                    }

                    res.Value.Dwords[0] = (ND_UINT32)(divident / src.Value.Dwords[0]);
                    res.Value.Dwords[1] = (ND_UINT32)(divident % src.Value.Dwords[0]);
                }
                else
                {
                    if (!ShemuX86CheckIdiv((ND_SINT64)divident, (ND_SINT64)(ND_SINT32)src.Value.Dwords[0], 4))
                    {
                        stop = ND_TRUE;
                        break;
                    }

                    res.Value.Dwords[0] = (ND_UINT32)(ND_SINT32)((ND_SINT64)divident / (ND_SINT32)src.Value.Dwords[0]);
                    res.Value.Dwords[1] = (ND_UINT32)(ND_SINT32)((ND_SINT64)divident % (ND_SINT32)src.Value.Dwords[0]);
                }

                ShemuX86SetGprValue(Context, NDR_EDX, 4, res.Value.Dwords[1], ND_FALSE);
                ShemuX86SetGprValue(Context, NDR_EAX, 4, res.Value.Dwords[0], ND_FALSE);
            }
            else if (src.Size == 8)
            {
                /// Not implemented!
            }

            break;

        case ND_INS_CLD:
            SET_FLAG(Context, NDR_RFLAG_DF, 0);
            break;

        case ND_INS_STD:
            SET_FLAG(Context, NDR_RFLAG_DF, 1);
            break;

        case ND_INS_CLC:
            SET_FLAG(Context, NDR_RFLAG_CF, 0);
            break;

        case ND_INS_STC:
            SET_FLAG(Context, NDR_RFLAG_CF, 1);
            break;

        case ND_INS_CMC:
            Context->Arch.X86.Registers.RegFlags ^= NDR_RFLAG_CF;
            break;

        case ND_INS_STI:
            if (Context->Arch.X86.Ring != 0)
            {
                return SHEMU_ABORT_NO_PRIVILEGE;
            }

            SET_FLAG(Context, NDR_RFLAG_IF, 1);
            break;

        case ND_INS_CLI:
            if (Context->Arch.X86.Ring != 0)
            {
                return SHEMU_ABORT_NO_PRIVILEGE;
            }

            SET_FLAG(Context, NDR_RFLAG_IF, 0);
            break;

        case ND_INS_SAHF:
            {
                ND_UINT8 ah = (ND_UINT8)ShemuX86GetGprValue(Context, NDR_AH, 1, ND_TRUE);
                // Handle reserved bits.
                ah |= (1 << 1);
                ah &= ~((1 << 3) | (1 << 5));
                ((ND_UINT8 *)&Context->Arch.X86.Registers.RegFlags)[0] = ah;
            }
            break;

        case ND_INS_LAHF:
            {
                ND_UINT8 ah = ((ND_UINT8 *)&Context->Arch.X86.Registers.RegFlags)[0];
                ShemuX86SetGprValue(Context, NDR_AH, 1, ah, ND_TRUE);
            }
            break;

        case ND_INS_SALC:
            if (GET_FLAG(Context, NDR_RFLAG_CF))
            {
                ShemuX86SetGprValue(Context, NDR_AL, 1, 0xFF, ND_FALSE);
            }
            else
            {
                ShemuX86SetGprValue(Context, NDR_AL, 1, 0x00, ND_FALSE);
            }
            break;

        case ND_INS_NOP:
            if (sled)
            {
                Context->NopCount++;
            }
            break;

        case ND_INS_WAIT:
            break;

        case ND_INS_CBW:
        case ND_INS_CWDE:
        case ND_INS_CDQE:
            GET_OP(Context, 1, &src);
            dst.Size = src.Size * 2;
            dst.Value.Qwords[0] = ND_SIGN_EX(src.Size, src.Value.Qwords[0]);
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_CWD:
        case ND_INS_CDQ:
        case ND_INS_CQO:
            GET_OP(Context, 1, &src);
            dst.Size = src.Size;
            if (ND_GET_SIGN(src.Size, src.Value.Qwords[0]))
            {
                dst.Value.Qwords[0] = 0xFFFFFFFFFFFFFFFF;
            }
            else
            {
                dst.Value.Qwords[0] = 0;
            }
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_BSWAP:
            GET_OP(Context, 0, &src);

            dst.Size = src.Size;

            switch (src.Size)
            {
            case 2:
                // Although undefined, when executing BSWAP with 16 bit operands, the result is set to 0.
                dst.Value.Words[0] = 0;
                break;
            case 4:
                dst.Value.Bytes[3] = src.Value.Bytes[0];
                dst.Value.Bytes[2] = src.Value.Bytes[1];
                dst.Value.Bytes[1] = src.Value.Bytes[2];
                dst.Value.Bytes[0] = src.Value.Bytes[3];
                break;
            case 8:
                dst.Value.Bytes[7] = src.Value.Bytes[0];
                dst.Value.Bytes[6] = src.Value.Bytes[1];
                dst.Value.Bytes[5] = src.Value.Bytes[2];
                dst.Value.Bytes[4] = src.Value.Bytes[3];
                dst.Value.Bytes[3] = src.Value.Bytes[4];
                dst.Value.Bytes[2] = src.Value.Bytes[5];
                dst.Value.Bytes[1] = src.Value.Bytes[6];
                dst.Value.Bytes[0] = src.Value.Bytes[7];
            default:
                break;
            }

            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_BSF:
        case ND_INS_BSR:
            // No APX form.
            GET_OP(Context, 1, &src);

            if (src.Value.Qwords[0] == 0)
            {
                SET_FLAG(Context, NDR_RFLAG_ZF, 1);
            }
            else
            {
                SET_FLAG(Context, NDR_RFLAG_ZF, 0);

                dst.Size = src.Size;
                dst.Value.Qwords[0] = ShemuX86CountZeroBits(src.Value.Qwords[0], 
                                                            src.Size, 
                                                            Context->Arch.X86.Instruction.Instruction == ND_INS_BSF);

                if (Context->Arch.X86.Instruction.Instruction == ND_INS_BSR)
                {
                    dst.Value.Qwords[0] = src.Size * 8ULL - dst.Value.Qwords[0] - 1;
                }

                SET_OP(Context, 0, &dst);
            }
            break;

        case ND_INS_POPCNT:
            // No {ND} form.
            GET_OP(Context, 1, &src);

            dst.Size = src.Size;
            dst.Value.Qwords[0] = 0;

            for (ND_UINT32 bit = 0; bit < src.Size * 8; bit++)
            {
                if (ND_GET_BIT(bit, src.Value.Qwords[0]))
                {
                    dst.Value.Qwords[0]++;
                }
            }

            SET_FLAG(Context, NDR_RFLAG_OF, 0);
            SET_FLAG(Context, NDR_RFLAG_SF, 0);
            SET_FLAG(Context, NDR_RFLAG_ZF, 0);
            SET_FLAG(Context, NDR_RFLAG_CF, 0);
            SET_FLAG(Context, NDR_RFLAG_PF, 0);
            SET_FLAG(Context, NDR_RFLAG_ZF, src.Value.Qwords[0] == 0 ? 1 : 0);

            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_LZCNT:
        case ND_INS_TZCNT:
            // No {ND} form.
            GET_OP(Context, 1, &src);

            dst.Size = src.Size;
            dst.Value.Qwords[0] = ShemuX86CountZeroBits(src.Value.Qwords[0], 
                                                        src.Size, 
                                                        Context->Arch.X86.Instruction.Instruction == ND_INS_TZCNT);

            // Set CF.
            if (dst.Value.Qwords[0] == src.Size * 8ULL)
            {
                SET_FLAG(Context, NDR_RFLAG_CF, 1);
            }
            else
            {
                SET_FLAG(Context, NDR_RFLAG_CF, 0);
            }

            // Set ZF.
            if (dst.Value.Qwords[0] == 0)
            {
                SET_FLAG(Context, NDR_RFLAG_ZF, 1);
            }
            else
            {
                SET_FLAG(Context, NDR_RFLAG_ZF, 0);
            }

            // Set the result.
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_SHLD:
        case ND_INS_SHRD:
            {
                SHEMU_VALUE cnt = { 0 };

                GET_OP(Context, hasNd ? 1 : 0, &dst);
                GET_OP(Context, hasNd ? 2 : 1, &src);
                GET_OP(Context, hasNd ? 3 : 2, &cnt);

                res.Size = dst.Size;

                if (dst.Size == 8)
                {
                    cnt.Value.Qwords[0] &= 0x3f;
                }
                else
                {
                    cnt.Value.Qwords[0] &= 0x1f;
                }

                if (cnt.Value.Qwords[0] == 0)
                {
                    // 0-shift, destination & flags are not affected.
                }
                else if (cnt.Value.Qwords[0] > dst.Size * 8ull)
                {
                    // Shift count larger that operand size, destination & flags are undefined. Store 0 in destination.
                    SET_OP(Context, 0, &res);
                }
                else
                {
                    // Positive shift, count less than operand size.
                    if (ND_INS_SHLD == Context->Arch.X86.Instruction.Instruction)
                    {
                        res.Value.Qwords[0] = dst.Value.Qwords[0] << cnt.Value.Qwords[0];

                        res.Value.Qwords[0] |= src.Value.Qwords[0] >> (dst.Size * 8ull - cnt.Value.Qwords[0]);

                        SET_FLAGS(Context, res, dst, cnt, FM_SHL);
                    }
                    else
                    {
                        res.Value.Qwords[0] = dst.Value.Qwords[0] >> cnt.Value.Qwords[0];

                        res.Value.Qwords[0] |= src.Value.Qwords[0] << (dst.Size * 8ull - cnt.Value.Qwords[0]);

                        SET_FLAGS(Context, res, dst, cnt, FM_SHR);
                    }

                    SET_OP(Context, 0, &res);
                }
            }
            break;

        case ND_INS_PREFETCH:
        case ND_INS_PREFETCHE:
        case ND_INS_PREFETCHM:
        case ND_INS_PREFETCHNTA:
        case ND_INS_PREFETCHT0:
        case ND_INS_PREFETCHT1:
        case ND_INS_PREFETCHT2:
        case ND_INS_PREFETCHW:
        case ND_INS_PREFETCHWT1:
            // Act as NOPs, they're just hints to the hardware prefetchers.
            break;

        case ND_INS_ENDBR:
            // Acts as a NOP, it's just a hint to the decoder.
            break;

        case ND_INS_LFENCE:
        case ND_INS_SFENCE:
        case ND_INS_MFENCE:
            // Nothing can be done for them, really.
            break;

        case ND_INS_CPUID:
            // OK; EAX, EBX, ECX and EDX are modified, which also zeroes the high 32 bit.
            ShemuX86SetGprValue(Context, NDR_RAX, 8, 0, ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_RCX, 8, 0, ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_RDX, 8, 0, ND_FALSE);
            ShemuX86SetGprValue(Context, NDR_RBX, 8, 0, ND_FALSE);
            break;

        // Some basic MMX/SSE instructions supported.
        case ND_INS_EMMS:
            nd_memzero(Context->Arch.X86.MmxRegisters, sizeof(Context->Arch.X86.MmxRegisters));
            break;

        case ND_INS_MOVD:
        case ND_INS_MOVQ:
        case ND_INS_MOVDQU:
        case ND_INS_MOVDQA:
            // If the source size is less than the destination size, the upper bits will be zero.
            // Note that we don't really care about #GP on unaligned MOVDQA accesses...
            GET_OP(Context, 1, &src);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_PUNPCKLBW:
            GET_OP(Context, 0, &dst);
            GET_OP(Context, 1, &src);
            if (dst.Size == 8)
            {
                // Operating on MMX register.
                dst.Value.Bytes[7] = src.Value.Bytes[3];
                dst.Value.Bytes[6] = dst.Value.Bytes[3];
                dst.Value.Bytes[5] = src.Value.Bytes[2];
                dst.Value.Bytes[4] = dst.Value.Bytes[2];
                dst.Value.Bytes[3] = src.Value.Bytes[1];
                dst.Value.Bytes[2] = dst.Value.Bytes[1];
                dst.Value.Bytes[1] = src.Value.Bytes[0];
            }
            else
            {
                // Operating on XMM register.
                dst.Value.Bytes[15] = src.Value.Bytes[7];
                dst.Value.Bytes[14] = dst.Value.Bytes[7];
                dst.Value.Bytes[13] = src.Value.Bytes[6];
                dst.Value.Bytes[12] = dst.Value.Bytes[6];
                dst.Value.Bytes[11] = src.Value.Bytes[5];
                dst.Value.Bytes[10] = dst.Value.Bytes[5];
                dst.Value.Bytes[9] = src.Value.Bytes[4];
                dst.Value.Bytes[8] = dst.Value.Bytes[4];
                dst.Value.Bytes[7] = src.Value.Bytes[3];
                dst.Value.Bytes[6] = dst.Value.Bytes[3];
                dst.Value.Bytes[5] = src.Value.Bytes[2];
                dst.Value.Bytes[4] = dst.Value.Bytes[2];
                dst.Value.Bytes[3] = src.Value.Bytes[1];
                dst.Value.Bytes[2] = dst.Value.Bytes[1];
                dst.Value.Bytes[1] = src.Value.Bytes[0];
            }
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_PXOR:
            GET_OP(Context, 0, &dst);
            GET_OP(Context, 1, &src);
            for (i = 0; i < dst.Size; i++)
            {
                dst.Value.Bytes[i] ^= src.Value.Bytes[i];
            }
            SET_OP(Context, 0, &dst);
            break;

        // Some basic AVX/AVX2 instructions support.
        case ND_INS_VMOVD:
        case ND_INS_VMOVQ:
        case ND_INS_VMOVDQU:
        case ND_INS_VMOVDQA:
            GET_OP(Context, 1, &src);
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_VPBROADCASTB:
        case ND_INS_VPBROADCASTW:
        case ND_INS_VPBROADCASTD:
        case ND_INS_VPBROADCASTQ:
            GET_OP(Context, 1, &src);
            dst.Size = Context->Arch.X86.Instruction.Operands[0].Size;
            for (i = 0; i < dst.Size / src.Size; i++)
            {
                switch (src.Size)
                {
                case 1:
                    dst.Value.Bytes[i] = src.Value.Bytes[0];
                    break;
                case 2:
                    dst.Value.Words[i] = src.Value.Words[0];
                    break;
                case 4:
                    dst.Value.Dwords[i] = src.Value.Dwords[0];
                    break;
                default:
                    dst.Value.Qwords[i] = src.Value.Qwords[0];
                    break;
                }
            }
            SET_OP(Context, 0, &dst);
            break;

        case ND_INS_VPXOR:
            GET_OP(Context, 1, &dst);
            GET_OP(Context, 2, &src);
            for (i = 0; i < dst.Size; i++)
            {
                dst.Value.Bytes[i] ^= src.Value.Bytes[i];
            }
            SET_OP(Context, 0, &dst);
            break;

        // Software interrupt/SYSCALL/SYSENTER.
        case ND_INS_INT:
            if ((Context->Arch.X86.Instruction.Immediate1 == 0x80 ||
                 Context->Arch.X86.Instruction.Immediate1 == 0x2E) &&
                Context->Arch.X86.Registers.RegRax < 0x1000)
            {
                Context->Flags |= SHEMU_FLAG_SYSCALL;
            }

            // Fall through
        case ND_INS_INT1:
        case ND_INS_INT3:
        case ND_INS_INTO:
            stop = ND_TRUE;
            break;

        case ND_INS_SYSCALL:
        case ND_INS_SYSENTER:
            if (Context->Arch.X86.Registers.RegRax < 0x1000)
            {
                Context->Flags |= SHEMU_FLAG_SYSCALL;
            }
            stop = ND_TRUE;
            break;

        // Some basic privileged instructions supported, specific to kernel-mode shellcodes.
        case ND_INS_SWAPGS:
            if (Context->Arch.X86.Ring != 0)
            {
                return SHEMU_ABORT_NO_PRIVILEGE;
            }

            Context->Flags |= SHEMU_FLAG_SWAPGS;
            stop = ND_TRUE;
            break;

        case ND_INS_RDMSR:
            if (Context->Arch.X86.Ring != 0)
            {
                return SHEMU_ABORT_NO_PRIVILEGE;
            }

            // Fetch ECX value.
            GET_OP(Context, 2, &src);

            if ((src.Value.Dwords[0] == 0xC0000082 && ND_CODE_64 == Context->Arch.X86.Mode) ||
                (src.Value.Dwords[0] == 0x00000176 && ND_CODE_32 == Context->Arch.X86.Mode))
            {
                Context->Flags |= SHEMU_FLAG_SYSCALL_MSR_READ;
            }

            stop = ND_TRUE;
            break;

        case ND_INS_WRMSR:
            if (Context->Arch.X86.Ring != 0)
            {
                return SHEMU_ABORT_NO_PRIVILEGE;
            }

            // Fetch ECX value.
            GET_OP(Context, 2, &src);

            if ((src.Value.Dwords[0] == 0xC0000082 && ND_CODE_64 == Context->Arch.X86.Mode) ||
                (src.Value.Dwords[0] == 0x00000176 && ND_CODE_32 == Context->Arch.X86.Mode))
            {
                Context->Flags |= SHEMU_FLAG_SYSCALL_MSR_WRITE;
            }

            stop = ND_TRUE;
            break;

        case ND_INS_SIDT:
            if (Context->Arch.X86.Ring == 0)
            {
                // Flag this only in ring0, as we treat the SHEMU_FLAG_SIDT as a ring0 specific indicator - it can be
                // used to locate the kernel image.
                Context->Flags |= SHEMU_FLAG_SIDT;
            }

            stop = ND_TRUE;
            break;

#if defined(ND_ARCH_X64) || defined(ND_ARCH_X86)
        case ND_INS_AESIMC:
        case ND_INS_AESDEC:
        case ND_INS_AESDECLAST:
        {
            __m128i val, key;

            // Make sure AES support is present, and we can emulate AES decryption using AES instructions.
            if (0 == (Context->Options & SHEMU_OPT_SUPPORT_AES))
            {
                break;
            }

            GET_OP(Context, 0, &dst);
            GET_OP(Context, 1, &src);

            shemu_memcpy(&val, &dst, 16);
            shemu_memcpy(&key, &src, 16);

            if (Context->Arch.X86.Instruction.Instruction == ND_INS_AESDEC)
            {
                val = _mm_aesdec_si128(val, key);
            }
            else if (Context->Arch.X86.Instruction.Instruction == ND_INS_AESDECLAST)
            {
                val = _mm_aesdeclast_si128(val, key);
            }
            else if (Context->Arch.X86.Instruction.Instruction == ND_INS_AESIMC)
            {
                val = _mm_aesimc_si128(key);
            }

            shemu_memcpy(&dst, &val, 16);

            SET_OP(Context, 0, &dst);
            break;
        }
#endif

        case ND_INS_RDTSC:
            src.Size = 4;
            // Set EAX to lower 32 bits.
            src.Value.Dwords[0] = tsc & 0xFFFFFFFF;
            SET_OP(Context, 0, &src);
            // Set EDX to upper 32 bits.
            src.Value.Dwords[0] = tsc >> 32;
            SET_OP(Context, 1, &src);
            break;

        case ND_INS_RDFSBASE:
        case ND_INS_RDGSBASE:
            src.Size = Context->Arch.X86.Instruction.Operands[0].Size;
            src.Value.Qwords[0] = Context->TibBase;
            SET_OP(Context, 0, &src);
            break;

        case ND_INS_UD0:
        case ND_INS_UD1:
        case ND_INS_UD2:
            stop = ND_TRUE;
            break;

        default:
            return SHEMU_ABORT_INSTRUX_NOT_SUPPORTED;

            break;
        }
    }

    // Minimum percent of the instructions were NOPs => consider we have a NOP sled. Note that we get here only if
    // the maximum number of instructions has been emulated successfully; if the emulation is aborted for any reason,
    // this code will have no effect.
    if ((Context->InstructionsCount >= Context->MaxInstructionsCount / 2) && 
        (Context->NopCount >= Context->InstructionsCount * Context->NopThreshold / 100))
    {
        Context->Flags |= SHEMU_FLAG_NOP_SLED;
    }

    return SHEMU_SUCCESS;
}