1
0
mirror of https://github.com/bitdefender/bddisasm.git synced 2024-11-18 13:38:07 +00:00
bddisasm/isagenerator/disasmlib.py
Andrei Vlad LUTAS ed564dba32 Specifically flag multi-byte NOP operands as not-accessed.
New capability - bddisasm can now be instructed whether to decode some instructions as NOPs are as MPX/CET/CLDEMOTE. This is the case for instructions that are mapped onto the wide NOP space: in that case, an encoding might be NOP if the feature is off, but might be something else (even #UD) if the feature is on.
Added NdDecodeWithContext API - this becomes the base decode API; it received the input information filled in a ND_CONTEXT structure, whih has to be initialized only once, and can be reused across calls. The NdInitContext function must be used to initialize the context, as it ensures backwards compatibility by filling new options with default values.
Improvements to the README file.
2020-07-30 11:07:14 +03:00

1365 lines
58 KiB
Python

#
# Copyright (c) 2020 Bitdefender
# SPDX-License-Identifier: Apache-2.0
#
import os
import sys
import re
import glob
valid_attributes = {
'MODRM', # Mod r/m is present.
'II64', # Instruction invalid in 64 bit mode.
'F64', # Operand size forced to 64 bit.
'D64', # Operand size defaults to 64 bit.
'O64', # Instruction valid only in 64 bit mode.
'SSECONDB', # Instruction has condition byte.
'COND', # Instruction has predicated encoded in lower 4 bit of the opcode.
'VSIB', # Instruction uses VSIB addressing.
'MIB', # Instruction uses MIB addressing.
'LIG', # *vex.L is ignored.
'WIG', # *vex.W is ignored.
'3DNOW', # Instruction uses 3dnow encoding.
'MMASK', # Instruction must have mask specified (mask cannot be k0).
'NOMZ', # Zeroing not allowed with memory addressing.
'LOCKSP', # Special lock - MOV CR on amd can use LOCK to access CR8 in 32 bit mode.
'NOL0', # Vector length 128 not supported.
'NOA16', # 16 bit addressing not supported.
'NO66', # 0x66 prefix causes #UD.
'NORIPREL', # RIP relative addressing not supported.
'VECT', # Vector instruction.
'S66', # 0x66 prefix changes length even if it is in special map (66, f2, f3).
'BITBASE', # Instruction uses bitbase addressing.
'AG', # Instruction uses address generation, no memory access.
'SHS', # Instruction accesses the shadow stack.
'MFR', # The Mod inside Mod R/M is forced to register. No SIB/disp present.
'CETT', # Instruction is CET tracked.
'OP1DEF', # Operand 1 is default (implicit).
'OP2DEF', # Operand 2 is default (implicit).
'OP2SEXO1', # Operand 2 is sign-extended to the size of the first operand.
'OP3SEXO1', # Operand 3 is sign-extended to the size of the first operand.
'OP1SEXDW', # Operand 1 is sign-extended to the size of the default word.
'PREFIX', # Prefix.
'SERIAL', # Instruction is serializing.
'SIBMEM', # Instruction uses sibmem addressing (AMX instructions).
'I67', # Ignore the address size override (0x67) prefix in 64 bit mode.
'IER', # Ignore embedded rounding for the instruction.
}
#
# Explicit operands types.
#
valid_optype = [
'A', # Direct address: the instruction has no ModR/M byte; the address of the
# operand is encoded in the instruction. No base register, index register,
# or scaling factor can be applied (for example, far JMP (EA)).
'B', # The VEX.vvvv field of the VEX prefix selects a general purpose register.
'C', # The reg field of the ModR/M byte selects a control register (for example,
# MOV (0F20, 0F22)).
'D', # The reg field of the ModR/M byte selects a debug register (for example,
# MOV (0F21,0F23)).
'E', # A ModR/M byte follows the opcode and specifies the operand. The operand
# is either a general-purpose register or a memory address. If it is a
# memory address, the address is computed from a segment register and any
# of the following values: a base register, an index register, a scaling
# factor, a displacement.
'F', # EFLAGS/RFLAGS Register.
'G', # The reg field of the ModR/M byte selects a general register (for example,
# AX (000)).
'H', # The VEX.vvvv field of the VEX prefix selects a 128-bit XMM register or a
# 256-bit YMM register, determined by operand type. For legacy SSE
# encodings this operand does not exist, changing the instruction to
# destructive form. Addition: 512 bit ZMM register may also be selected in
# EVEX encodings.
'I', # Immediate data: the operand value is encoded in subsequent bytes of the
# instruction.
'J', # The instruction contains a relative offset to be added to the instruction
# pointer register (for example, JMP (0E9), LOOP).
'K', # The operand is the stack.
'L', # The upper 4 bits of the 8-bit immediate selects a 128-bit XMM register
# or a 256-bit YMM register, determined by operand type. (the MSB is
# ignored in 32-bit mode). Addition: a 512 bit ZMM register may also be
# selected using EVEX encoding.
'M', # The ModR/M byte may refer only to memory (for example, BOUND, LES, LDS,
# LSS, LFS, LGS, CMPXCHG8B).
'N', # The R/M field of the ModR/M byte selects a packed-quadword, MMX
# technology register.
'O', # The instruction has no ModR/M byte. The offset of the operand is coded
# as a word or double word (depending on address size attribute) in the
# instruction. No base register, index register, or scaling factor can be
# applied (for example, MOV (A0-A3)).
'P', # The reg field of the ModR/M byte selects a packed quadword MMX technology
# register.
'Q', # A ModR/M byte follows the opcode and specifies the operand. The operand
# is either an MMX technology register or a memory address. If it is a
# memory address, the address is computed from a segment register and any
# of the following values: a base register, an index register, a scaling
# factor, and a displacement.
'R', # The R/M field of the ModR/M byte may refer only to a general register
# (for example, MOV (0F20-0F23)).
'S', # The reg field of the ModR/M byte selects a segment register (for example, MOV (8C,8E)).
'T', # The reg field of the ModR/M byte selects a test register (for example, MOV (0F24, 0F26)).
'U', # The R/M field of the ModR/M byte selects a 128-bit XMM register or a 256-bit YMM register,
# determined by operand type. Addition: a 512-bit ZMM register may also be selected using EVEX
# encodings.
'V', # The reg field of the ModR/M byte selects a 128-bit XMM register or a 256-bit YMM register,
# determined by operand type. Addition: a 512-bit ZMM register may also be selected using
# EVEX encodings.
'W', # A ModR/M byte follows the opcode and specifies the operand. The operand is either a 128-bit
# XMM register, a 256-bit YMM register (determined by operand type), or a memory address. If it is
# a memory address, the address is computed from a segment register and any of the following values:
# a base register, an index register, a scaling factor, and a displacement. Addition:a 512-bit ZMM
# register may also be selected # using EVEX encodings.
'X', # Memory addressed by the DS:rSI register pair (for example, MOVS, CMPS, OUTS, or LODS).
'Y', # Memory addressed by the ES:rDI register pair (for example, MOVS, CMPS, INS, STOS, or SCAS).
'Z', # The low 3 bits inside the opcode select a general purpose register. R field inside REX may
# extend it.
'rB', # The reg field selects a BND register.
'mB', # The rm field selects A BND register or a memory location.
'rK', # The reg field selects a mask register.
'vK', # The vvvv field of the VEX prefix selects a mask register.
'mK', # The rm field selects e mask register.
'aK', # The aaa field inside evex selects a mask register which is used for masking of a destination
# operand.
'rM', # The reg field inside modrm encodes the base address of a memory operand. Default segment is ES.
'mM', # The rm field inside modrm encodes the base address of a memory operand, iregardless of the mod
# fields. Default segment is DS.
'rT', # The reg field inside modrm encodes a TMM register (AMX extension).
'mT', # The rm field inside modrm encodes a TMM register (AMX extension).
'vT', # The v field inside vex encodes a TMM register (AMX extension).
'm2zI', # Bits [1,0] of the immediate byte which selects the fourth register.
]
# Operand sizes.
valid_opsize = [
'a', # Two one-word operands in memory or two double-word operands in memory,
# depending on operand-size attribute (used only by the BOUND instruction).
'b', # Byte, regardless of operand-size attribute.
'c', # Byte or word, depending on operand-size attribute.
'd', # Doubleword, regardless of operand-size attribute.
'dq', # Double-quadword, regardless of operand-size attribute (XMM register or
# 128 bit memory location). A smaller quantity from the 128 bit register may be accessed.
'e', # eighth = word or dword or qword.
'f', # fourth = dword or qword or oword.
'h', # half = qword or oword or yword.
'n', # normal = 128, 256 or 512 bits, depending on vector length.
'u', # 256 or 512 bit, depending on vector length.
# VSIB addressing
'vm32x', # VSIB addressing, using DWORD indices in XMM register, select 32/64 bit.
'vm32y', # VSIB addressing, using DWORD indices in YMM register, select 32/64 bit.
'vm32z', # VSIB addressing, using DWORD indices in ZMM register, select 32/64 bit.
'vm32h', # VSIB addressing, using DWORD indices in half register, select 32/64 bit.
'vm32n', # VSIB addressing, using DWORD indices in normal register, select 32/64 bit.
'vm64x', # VSIB addressing, using QWORD indices in XMM register, select 32/64 bit.
'vm64y', # VSIB addressing, using QWORD indices in YMM register, select 32/64 bit.
'vm64z', # VSIB addressing, using QWORD indices in ZMM register, select 32/64 bit.
'vm64h', # VSIB addressing, using QWORD indices in half register, select 32/64 bit.
'vm64n', # VSIB addressing, using QWORD indices in normal register, select 32/64 bit.
# MIB addressing
'mib', # MIB addressing, the base & the index are used to form a pointer.
# Stack sizes and partial access
'v2', # Two stack words.
'v3', # Three stack words.
'v4', # Four stack words.
'v8', # Eight stack words.
# These are aliased over 'dq.*' encodings.
'o', # Always 128 bits/2 QWORDs. Same as 'dq'.
'oq', # 512 bit regardless the operand size/vector length.
'p', # 32, 48 or 80 bits pointer, depending on operand size.
'pd', # 128 bit or 256 bit double-precision fp data.
'ps', # 128 bit or 256 bit single-prevision fp data.
'q', # Always 1 QWORD.
'qq', # Always 4 QWORDs.
's', # 6-byte or 10-byte pseudo-descriptor.
'sd', # Scalar element of 128 bit double-precision fp data.
'ss', # Scalar element of 128 bit single-precision fp data.
'v', # WORD, DWORD or QWORD, depending on operand size.
'w', # Always WORD.
'x', # 128 bit, 256 bit, depending on operand size.
'y', # DWORD or QWORD, depending on operand size.
'yf', # Always QWORD in 64 bit mode and DWORD in 16/32 bit mode.
'z', # WORD for 16 bit op size, DWORD for 32 & 64 bit operand size.
'?', # Unknown operand size. Depends on many factors (for example, XSAVE).
'0', # Used for instructions that do not actually access any memory.
'asz', # The size of the operand is given by the current addressing mode.
'ssz', # The size of the operand is given by the current stack mode.
'fa', # FPU integer binary coded decimal.
'fw', # FPU real word.
'fd', # FPU real dword.
'fq', # FPU real qword.
'ft', # FPU real extended.
'fe', # FPU environment.
'fs', # FPU state.
'l', # Either a 64 bit or a 128 bit operand size (used by BNDMOV).
'rx', # 512 bytes extended state.
'cl', # 32/64/128 bytes - the size of one cache line.
'12', # 4 bytes (0) + 8 bytes (old SSP), used by SAVEPREVSSP.
't', # A tile register. The size varies dependning on execution environment, but can be as high as 1K.
]
# Implicit/fixed operands. Self explanatory.
valid_impops = {# register size
'AH' : ('AH', 'b'), # AH register.
'AL' : ('rAX', 'b'), # AL register.
'AX' : ('rAX', 'w'), # AX register.
'EAX' : ('rAX', 'd'), # EAX register.
'RAX' : ('rAX', 'q'), # RAX register.
'eAX' : ('rAX', 'z'), # AX or EAX register, depending on op size.
'rAX' : ('rAX', 'v'), # AX, EAX or RAX register, depending on op size.
'yAX' : ('rAX', 'y'), # EAX or RAX register, depending on op size.
'CL' : ('rCX', 'b'), # CL register.
'ECX' : ('rCX', 'd'), # ECX register.
'RCX' : ('rCX', 'q'), # RCX register.
'eCX' : ('rCX', 'z'), # CX or ECX register.
'rCX' : ('rCX', 'v'), # CX, ECX or RCX register, depending on op size.
'yCX' : ('rCX', 'y'), # ECX or RCX register, depending on op size.
'aCX' : ('rCX', 'asz'), # CX, ECX or RCX register, depedning on address size.
'DX' : ('rDX', 'w'), # DX register.
'EDX' : ('rDX', 'd'), # EDX register.
'RDX' : ('rDX', 'q'), # RDX register.
'eDX' : ('rDX', 'z'), # DX or EDX register, depending on op size.
'rDX' : ('rDX', 'v'), # DX, EDX or RDX register, depending on op size.
'yDX' : ('rDX', 'y'), # EDX or RDX register, depending on op size.
'EBX' : ('rBX', 'd'), # EBX register.
'RBX' : ('rBX', 'q'), # RBX register.
'rBX' : ('rBX', 'v'), # BX, EBX or RBX register, depending on op size.
'yBX' : ('rBX', 'y'), # EBX or RBX register, depending on op size.
'rBP' : ('rBP', 'v'), # BP, EBP or RBP register, depending on op size.
'sBP' : ('rBP', 'ssz'), # BP, EBP or RBP register, depending on stack size.
'rSP' : ('rSP', 'v'), # SP, ESP or RSP register, depending on op size.
'sSP' : ('rSP', 'ssz'), # SP, ESP or RSP register, depending on stack size.
'aSI' : ('rSI', 'asz'), # SI, ESI, or RSI register, depending on address size.
'aDI' : ('rDI', 'asz'), # DI, EDI, or RDI register, depending on address size.
'R11' : ('rR11', 'q'), # R11 register.
'rIP' : ('rIP', 'v'), # IP, EIP or RIP, depending on op size.
'yIP' : ('rIP', 'yf'), # EIP in 16/32 bit mode, or RIP in 64 bit mode.
'1' : ('1', 'b'), # Constant 1.
'XMM0' : ('XMM0', 'dq'), # XMM0 register.
'ST(0)' : ('ST(0)', 'ft'), # ST(0) register.
'ST(i)' : ('ST(i)', 'ft'), # ST(1) register.
'CS' : ('CS', 'v'), # CS register.
'SS' : ('SS', 'v'), # SS register.
'DS' : ('DS', 'v'), # DS register.
'ES' : ('ES', 'v'), # ES register.
'FS' : ('FS', 'v'), # FS register.
'GS' : ('GS', 'v'), # GS register.
'CR0' : ('CR0', 'yf'), # CR0 register.
'XCR' : ('XCR', 'q'), # An XCR register.
'XCR0' : ('XCR0', 'q'), # XCR0 register.
'MSR' : ('MSR', 'q'), # A MSR.
'TSC' : ('TSC', 'q'), # TSC register.
'TSCAUX' : ('TSCAUX', 'q'), # TSXAUX register.
'SCS' : ('SCS', 'q'), # IA32_SYSNETER_CS register.
'SEIP' : ('SEIP', 'q'), # IA32_SYSENTER_EIP register.
'SESP' : ('SESP', 'q'), # IA32_SYSENTER_ESP register.
'FSBASE' : ('FSBASE', 'q'), # IA32_FS_BASE register.
'GSBASE' : ('GSBASE', 'q'), # IA32_GS_BASE register.
'KGSBASE' : ('KGSBASE', 'q'), # IA32_KERNEL_GS_BASE register.
'STAR' : ('STAR', 'q'), # IA32_STAR register.
'LSTAR' : ('LSTAR', 'q'), # IA32_LSTAR register.
'FMASK' : ('FMASK', 'q'), # IA32_FMASK register.
'GDTR' : ('GDTR', 's'), # GDT register.
'IDTR' : ('IDTR', 's'), # IDT register.
'LDTR' : ('LDTR', 'w'), # LDT register.
'TR' : ('TR', 'w'), # Task register.
'BANK' : ('BANK', '?'), # A register bank.
'X87CONTROL':('X87CONTROL', 'w'), # X87 control register.
'X87TAG' : ('X87TAG', 'w'), # X87 tag register.
'X87STATUS': ('X87STATUS', 'w'), # X87 status register.
'MXCSR' : ('MXCSR', 'd'), # MXCSR register.
'PKRU' : ('PKRU', 'd'), # PKRU register.
'SSP' : ('SSP', 'yf'), # Shadow stack pointer. 32 bit in protected/compat mode, 64 in long mode.
# Implicit memory operands.
'pBXALb' : ('pBXAL', 'b'), # Implicit [RBX + AL], as used by XLAT.
'pDIq' : ('pDI', 'q'), # Implicit qword [RDI].
'pDIdq' : ('pDI', 'dq'), # Implicit xmmword [RDI].
# Implicit shadow stack accesses.
'SHS' : ('SHS', 'q'), # Shadow stack (SSP) implicit access, 1 qword (use by CET instructions).
'SHS0' : ('SHS0', 'q'), # Shadow stack (IA32_PL0_SSP) implicit access, 1 qword (use by CET instructions).
'SHSI' : ('SHS', 'v2'), # Shadow stack load & discard, 2 elements (INCCSPD/INCSSPQ).
'SHSS' : ('SHS', '12'), # Shadow stack read & store 4 + 8 bytes (SAVEPREVSSP).
'SHS1' : ('SHSP', 'v'), # Shadow stack push/pop, 1 word.
'SHS2' : ('SHSP', 'v2'), # Shadow stack push/pop, 2 words.
'SHS3' : ('SHSP', 'v3'), # Shadow stack push/pop, 3 words.
'SHS4' : ('SHSP', 'v4'), # Shadow stack push/pop, 4 words.
}
# If an operand type is not present here, than that operand is implicit & it's not encoded inside the instruction.
operand_encoding = {
'A' : 'D', # Immediate, encoded directly in the instruction bytes.
'B' : 'V', # VEX/EVEX.vvvv encoded general purpose register.
'C' : 'R', # Modrm.reg encoded control register.
'D' : 'R', # Modrm.reg encoded debug register.
'E' : 'M', # Modrm.rm encoded general purpose register or memory.
'G' : 'R', # Modrm.reg encoded general purpose register.
'H' : 'V', # VEX/EVEX.vvvv encoded vector register.
'I' : 'I', # Immediate, encoded directly in the instruction bytes.
'J' : 'D', # Relative offset, encoded directly in the instruction bytes.
'L' : 'L', # Register encoded in an immediate.
'M' : 'M', # Modrm.rm encoded memory.
'N' : 'M', # Modrm.rm encoded MMX register.
'O' : 'D', # Absolute memory encoded directly in the instruction.
'P' : 'R', # Modrm.reg encoded MMX register.
'Q' : 'M', # Modrm.rm encoded MMX register or memory.
'R' : 'M', # Modrm.rm encoded general purpose register.
'S' : 'R', # Modrm.reg encoded segment register.
'T' : 'R', # Modrm.reg encoded test register.
'U' : 'M', # Modrm.rm encoded vector register.
'V' : 'R', # Modrm.reg encoded vector register.
'W' : 'M', # Modrm.rm encoded vector register or memory.
'Z' : 'O', # General purpose register encoded in opcode low 3 bit.
'rB' : 'R', # Modrm.reg encoded bound register.
'mB' : 'M', # Modrm.rm encoded bound register or memory.
'rK' : 'R', # Modrm.reg encoded mask register.
'vK' : 'V', # VEX/EVEX.vvvv encoded mask register.
'mK' : 'M', # Modrm.rm encoded mask register or memory.
'aK' : 'A', # EVEX.aaa encoded mask register.
'mR' : 'R', # Modrm.reg encoded memory.
'mM' : 'M', # Modrm.rm encoded memory (always).
'1' : '1', # Constant 1.
'CL' : 'C', # CL register.
'ST(i)' : 'M', # Modrm.rm encoded FPU register.
}
valid_prefixes = [
'REP', # Rep prefix is accepted.
'REPC', # Conditional rep prefix is accepted.
'HLE', # Hardware Lock Elision accepted.
'BND', # Bound prefix accepted (MPX).
'LOCK', # Lock prefix accepted.
'BH', # Branch hints accepted.
'XACQUIRE', # Xacquire prefix accepted.
'XRELEASE', # Xrelease prefix accepted.
'HLEWOL', # HLE prefix is accepted without lock - used by MOV instructions.
'DNT', # Do Not Track prefix accepted (CET).
]
valid_access = [
'N', # No access.
'P', # Prefetch access.
'R', # Read.
'W', # Write.
'CR', # Conditional read.
'CW', # Conditional write.
'RW', # Read-Write.
'CRW', # Conditional Read-Write.
'RCW', # Read-Conditional Write.
'CRCW', # Conditional Read-Conditional Write.
]
valid_flags = [
'CF', # Carry.
'PF', # Parity.
'AF', # Auxiliary.
'ZF', # Zero.
'SF', # Sign.
'TF', # Trap.
'IF', # Interrupt.
'DF', # Direction.
'OF', # Overflow.
'IOPL', # I/O privilege level.
'NT', # Nested Task.
'RF', # Resume Flag.
'VM', # V8086 mmode.
'AC', # Alignment Check.
'VIF', # Virtual IF.
'VIP', # Virtual IP.
'ID' # CPUID ID flag.
]
valid_flag_op = [
'm', # modified.
't', # tested.
'0', # cleared.
'1', # set.
'u', # undefined.
'n', # not accessed.
]
valid_cpu_modes = [
'r0', # Ring 0.
'r1', # Ring 1.
'r2', # Ring 2.
'r3', # Ring 3.
'real', # Real mode.
'v8086', # V8086 mode.
'prot', # Protected mode.
'compat', # Compatibility mode.
'long', # Long mode.
'smm', # System Management Mode.
'sgx', # Software Guard Extensions SGX enclave.
'tsx', # Transactional Synchronization Extensions.
'vmxr', # VMX root.
'vmxn', # VMX non-root.
'vmxo', # Outside VMX operation.
]
valid_mode_groups = [
"ring",
"mode",
"vmx",
"other",
]
valid_ring_modes = [
"r0",
"r1",
"r2",
"r3",
]
valid_mode_modes = [
"real",
"smm",
"v8086",
"prot",
"compat",
"long",
]
valid_vmx_modes = [
"vmxr",
"vmxn",
"vmxo",
]
valid_other_modes = [
"sgx",
"tsx",
]
valid_mode_map = {
"ring" : valid_ring_modes,
"mode" : valid_mode_modes,
"vmx" : valid_vmx_modes,
"other" : valid_other_modes,
}
valid_decorators = [
'{K}', # Masking support.
'{z}', # Zeroing support.
'{sae}', # Surpress All Exceptions.
'{er}', # Embedded Rounding.
'|B32', # Broadcast 32.
'|B64', # Broadcast 64.
]
valid_tuples = [
'fv', # Full Vector, Load+Op (Full Vector Dword/Qword).
'hv', # Half Vector, Load+Op (Half Vector).
'fvm', # Full Vector Memory, Load/store or subDword full vector.
'hvm', # Half Vector Memory, SubQword Conversion.
'qvm', # Quarter Vector Memory, SubDword Conversion.
'ovm', # Oct Vector Memory, SubWord Conversion.
'dup', # Dup, VMOVDDUP.
'm128', # Mem 128, Shift count from memory.
't1s8', # Tuple 1 Scalar, 8 bit, 1Tuple less than Full Vector.
't1s16', # Tuple 1 Scalar, 16 bit, 1Tuple less than Full Vector.
't1s', # Tuple 1 Scalar, 32/64 bit, 1Tuple less than Full Vector.
't1f', # Tuple 1 Fixed, 1 Tuple memsize not affected by EVEX.W.
't2', # Tuple 2, Broadcast (2 elements).
't4', # Tuple 4, Broadcast (4 elements).
't8', # Tuple 8, Broadcast (8 elements).
't1_4x',
]
class InvalidEncodingException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
class ParseLineException(Exception):
def __init__(self, value):
self.value = value
def __str__(self):
return repr(self.value)
def reverse_dict(d):
r = {}
for k in d:
r[d[k]] = k
return r
def my_str(x):
if x is None:
return x
else:
return str(x)
#
# CPUID feature flags.
#
class CpuidFeatureFlag():
def __init__(self, finfo):
self.Name = finfo["name"]
self.Leaf = finfo["leaf"]
self.SubLeaf = finfo["subleaf"]
self.Reg = finfo["reg"]
self.Bit = finfo["bit"]
def __str__(self):
return "%s: %s, %s, %s, %s" % (self.Name, self.Leaf, self.SubLeaf, self.Reg, self.Bit)
#
# Operand description
#
class Operand():
def __init__(self, op, access, flags, imp = False):
self.Raw = op
self.Type = 0
self.Size = 0
self.Flags = flags
self.Decorators = []
self.Access = []
self.Block = 0
self.Encoding = 'S'
self.Implicit = imp
orig = op
# Handle block registers.
if op.endswith('+3'):
self.Block = 4
op = op.replace('+3', '')
elif op.endswith('+1'):
self.Block = 2
op = op.replace('+1', '')
# Handle the decorators.
for dec in valid_decorators:
if -1 != op.find(dec):
# Found decorator.
self.Decorators.append(dec)
# Remove it from the opstring.
op = op.replace(dec, "")
# Handle hard-coded operators - those that are implicit/are not encoded anywhere.
if op in valid_impops:
self.Type, self.Size = valid_impops[op][0], valid_impops[op][1]
# Now handle explicit operators.
else:
# Attempt a match inside the explicit operands map.
for opt in valid_optype:
if op.startswith(opt):
self.Type = opt
op = op.replace(opt, "")
break
# Now the operand size. After parsing the decorator and the operand type, we should be left with
# the operand size only.
if self.Type in ['rK', 'mK', 'vK', 'aK'] and not op in valid_opsize:
self.Size = 'q'
elif op in valid_opsize:
self.Size = op
else:
raise InvalidEncodingException('Invalid operand size specified: ' + orig)
if self.Type in operand_encoding:
self.Encoding = operand_encoding[self.Type]
elif self.Raw in operand_encoding:
self.Encoding = operand_encoding[self.Raw]
if imp and 'OPDEF' not in self.Flags:
self.Flags.append('OPDEF')
self.Access = access
def __str__(self):
if True:
return self.Raw
#
# Prefixes.
#
class Prefix():
def __init__(self, prefix):
self.Mnemonic = prefix["mnemonic"]
self.Encoding = prefix["encoding"]
def __str__(self):
return self.Mnemonic
#
# Instructions.
#
class Instruction():
def __init__(self, iinfo):
# Fill in raw instruction information
self.Mnemonic = iinfo["mnemonic"]
self.RawEnc = iinfo["encoding"]
self.Flags = iinfo["flags"]
self.Prefmap = iinfo["prefixes"]
self.Set = iinfo["set"]
self.Category = iinfo["cat"]
self.Class = iinfo["class"]
self.Rwm = iinfo["rwm"]
self.Id = iinfo["cff"] or self.Set
self.Tuple = iinfo["tuple"]
self.ExClass = iinfo["exclass"]
self.RevFlagsAccess = iinfo["flgaccess"]
self.Modes = iinfo["modes"]
self.FpuFlags = iinfo["fpuflg"]
# First redirecton class: opcodes
self.Opcodes = []
self.Prefixes = []
self.DecoFlags = []
# Second redirection class: Modrm
self.HasModrm = self.ModrmRedirAfterMpref = False
self.Mod = self.Reg = self.Rm = None
# Third redirection class: mandatory prefix.
self.Np = self.MustHave66 = self.MustHaveF2 = self.MustHaveF3 = False
# Fourth redirection class: operating mode
self.RedM16 = self.RedM32 = self.RedM64 = False
# Fifth redirection class: default operand size
self.RedDs16 = self.RedDs32 = self.RedDs64 = self.RedDDs64 = self.RedFDs64 = False
# Sixth redirection class: default address size
self.RedAs16 = self.RedAs32 = self.RedAs64 = False
# Seventh redirecton class: rex, rex.w, rep, repz
self.RedRex = self.RedRexW = self.RedRep = self.Red64 = self.RedF3 = False
# Misc - vendor
self.Vendor = None
# Misc - feature.
self.Feature = None
# XOP, VEX and EVEX classes.
self.Vex = self.Xop = self.Evex = self.Mvex = False
self.M = self.P = self.L = self.W = None
# Now parse each info chunk and extract the actual data
for t in iinfo["encoding"].split(' '):
if '0x66' == t and not self.Opcodes and not (self.Xop or self.Vex or self.Evex):
self.Prefixes.append(0x66)
self.MustHave66 = True
elif '0xF3' == t and not self.Opcodes and not (self.Xop or self.Vex or self.Evex):
self.Prefixes.append(0xF3)
self.MustHaveF3 = True
elif '0xF2' == t and not self.Opcodes and not (self.Xop or self.Vex or self.Evex):
self.Prefixes.append(0xF2)
self.MustHaveF2 = True
elif 'NP' == t:
self.Np = True
elif 'a0xF3' == t:
self.Prefixes.append(0xF3)
self.RedF3 = True
elif 'o64' == t:
self.Red64 = True
elif 'rexw' == t:
self.RedRexW = True
elif 'rex' == t:
self.RedRex = True
elif 'rep' == t:
self.RedRep = True
elif 'ds16' == t:
self.RedDs16 = True
elif 'ds32' == t:
self.RedDs32 = True
elif 'ds64' == t:
self.RedDs64 = True
elif 'dds64' == t:
self.RedDDs64 = True
elif 'fds64' == t:
self.RedFDs64 = True
elif 'as16' == t:
self.RedAs16 = True
elif 'as32' == t:
self.RedAs32 = True
elif 'as64' == t:
self.RedAs64 = True
elif t.startswith('/'):
self.HasModrm = True
self.Flags.append('MODRM')
if t.endswith(':mem'):
self.Mod = 'mem'
if t.endswith('reg'):
self.Mod = 'reg'
t = t.replace(':mem', '').replace(':reg', '')
for i in range(0, 8):
if '/%d' % i == t:
self.Reg = i
if re.match(r'0x[0-9a-fA-F]{2}', t[1:]):
mrm = int(t[1:], 16)
if 0xC0 == (mrm & 0xC0):
self.Mod = 'reg'
else:
self.Mod = 'mem'
self.Rm = mrm & 7
self.Reg = (mrm >> 3) & 7
elif 'modrm' == t:
self.HasModrm = True
self.Flags.append('MODRM')
elif t.startswith('mod:'):
self.Mod = t[4:]
if self.Mod not in ['mem', 'reg']:
raise InvalidEncodingException('Invalid encoding: illegal "mod" modifier')
elif t.startswith('reg:'):
self.Reg = t[4:]
if self.Reg not in ['0', '1', '2', '3', '4', '5', '6', '7']:
raise InvalidEncodingException('Invalid encoding: illegal "reg" value')
self.Reg = int(self.Reg)
elif t.startswith('rm:'):
self.Rm = t[3:]
if self.Rm not in ['0', '1', '2', '3', '4', '5', '6', '7']:
raise InvalidEncodingException('Invalid encoding: illegal "rm" value')
self.Rm = int(self.Rm)
elif t.startswith('modrmpmp'):
self.ModrmRedirAfterMpref = True
elif t == 'xop':
self.Xop = True
elif t == 'vex':
self.Vex = True
elif t == 'evex':
self.Evex = True
elif t == 'mvex':
self.Mvex = True
elif t.startswith('m:'):
self.M = t[2:]
if self.M not in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C']:
raise InvalidEncodingException('Invalid encoding: illegal "mmmmm" value')
self.M = int(self.M, 16)
elif t.startswith('p:'):
self.P = t[2:]
if not self.P in ['0', '1', '2', '3']:
raise InvalidEncodingException('Invalid encoding: illegal "pp" value!')
self.P = int(self.P)
elif t.startswith('l:'):
self.L = t[2:]
if self.L == '128':
self.L = 0
elif self.L == '256':
self.L = 1
elif self.L == '512':
self.L = 2
elif self.L == 'x':
self.L = None
elif self.L == 'i':
self.L = None
if 'LIG' not in self.Flags:
self.Flags.append('LIG')
elif self.L in ['0', '1', '2', '3']:
self.L = int(self.L)
else:
raise InvalidEncodingException('Invalid encoding: illegal "l" value!')
elif t.startswith('w:'):
self.W = self.RawW = t[2:]
if self.W == 'x':
self.W = None
elif self.W == 'i':
self.W = None
if 'WIG' not in self.Flags:
self.Flags.append('WIG')
elif self.W in ['0', '1']:
self.W = int(self.W)
else:
raise InvalidEncodingException('Invalid encoding: illegal "w" value!')
elif re.match(r'0x[0-9a-fA-F]{2}', t):
self.Opcodes.append(int(t, 16))
elif t in ['intel', 'amd', 'via', 'cyrix']:
self.Vendor = t
elif t in ['mpx', 'cet', 'cldm']:
self.Feature = t
elif 'vsib' == t:
self.HasVsib = True
if 'VSIB' not in self.Flags:
self.Flags.append('VSIB')
elif 'mib' == t:
self.HasMib = True
if 'MIB' not in self.Flags:
self.Flags.append('MIB')
elif 'bitbase' == t:
self.HasBitbase = True
if 'BITBASE' not in self.Flags:
self.Flags.append('BITBASE')
elif 'sibmem' == t:
self.HasSibMem = True
if 'SIBMEM' not in self.Flags:
self.Flags.append('SIBMEM')
elif t in ['ib', 'iw', 'iz', 'iv', 'id', 'cb', 'cz', 'cv', 'cp', 'is4']:
# Not used for now, but they must be specified, for a complete instruction encoding specification.
pass
elif t.startswith('evex.'):
tokens2 = t.split('.')
self.Evex = True
self.M = self.P = self.L = self.W = 0
for t2 in tokens2[1:]:
# Handle the L specifier
if t2 == 'LIG':
self.L = None
if 'LIG' not in self.Flags:
self.Flags.append('LIG')
elif t2 == 'LANY':
self.L = None
elif t2 == '128' or t2 == 'LZ' or t2 == 'L0':
self.L = 0
elif t2 == '256' or t2 == 'L1':
self.L = 1
elif t2 == '512' or t2 == 'L2':
self.L = 2
# Handle the W specifier
elif t2 == 'WIG':
self.W = None
if 'WIG' not in self.Flags:
self.Flags.append('WIG')
elif t2 == 'WANY':
self.W = None
elif t2 == 'W0':
self.W = 0
elif t2 == 'W1':
self.W = 1
# Handle compressed prefix
elif t2 == '66':
self.P = 1
elif t2 == 'F3':
self.P = 2
elif t2 == 'F2':
self.P = 3
# Handle opcode map
elif t2 == '0F':
self.M = 1
elif t2 == '0F38':
self.M = 2
elif t2 == '0F3A':
self.M = 3
elif t2 in ['NDS', 'NDD', 'DDS']:
pass
else:
raise InvalidEncodingException('Unknwon new evex token: %s/%s' % (t, t2))
else:
raise InvalidEncodingException('Unknown token: %s' % t)
# Pre-process the explicit operands. The mask register is contained as a decorator, but put it as a direct
# operand as well. The access flag is already present in rwm.
if len(iinfo["expops"]) >= 1 and iinfo["expops"][0].find("{K") > 0:
iinfo["expops"].insert(1, 'aKq')
# Parse the explicit instruction operands.
self.ExpOps = self.process_operands(iinfo["expops"], False)
# Parse the implicit instruction operands.
self.ImpOps = self.process_operands(iinfo["impops"], True)
# Post-process the operands. We fill up the flags with additional info based on the operands.
for op in self.ExpOps:
for deco in op.Decorators:
self.DecoFlags.append({'{K}':'MASK', '{z}':'ZERO', '{sae}':'SAE', '{er}':'ER', '|B32':'BROADCAST', '|B64':'BROADCAST'}[deco])
if op.Type in ['U', 'V', 'W', 'H', 'L'] and 'VECT' not in self.Flags:
self.Flags.append('VECT')
# VEX, XOP, EVEX and MVEX instructions are not valid in real or v8086 modes.
if self.Vex or self.Xop or self.Evex or self.Mvex:
if 'real' in self.Modes:
self.Modes.remove('real')
if 'v8086' in self.Modes:
self.Modes.remove('v8086')
if 'long' not in self.Modes and 'II64' not in self.Flags:
self.Flags.append('II64')
if 'long' in self.Modes and 'prot' not in self.Modes and 'O64' not in self.Flags:
self.Flags.append('O64')
# Split the instruction into encoding entities.
e = self.split_encoding()
if self.Vex or self.Xop or self.Evex:
self.Spec = { "mmmmm" : e[0], "opcodes" : e[1], "modrm" : e[2], "pp" : e[3], "l" : e[4], "w" : e[5] }
else:
self.Spec = { "opcodes" : e[0], "modrm" : e[1], "mpre" : e[2], "mode" : e[3], "dsize" : e[4], \
"asize" : e[5], "opre" : e[6], "vendor" : e[7], "feature": e[8] }
def process_operands(self, ops, imp = False):
p = 1
res = []
for op in ops:
if op == "nil":
break
flags = []
if not imp:
for f in self.Flags:
if f.startswith('OP%d' % p):
flags.append('OP' + f[3:])
self.Flags.remove(f)
else:
flags.append('OPDEF')
if not imp:
res.append(Operand(op, self.Rwm[p - 1], flags, imp))
else:
res.append(Operand(op, self.Rwm[len(self.ExpOps) + p - 1], flags, imp))
p += 1
return res
def split_encoding(self):
if self.Vex or self.Xop or self.Evex or self.Mvex:
return self.split_encoding_vex()
else:
return self.split_encoding_legacy()
def split_encoding_vex(self):
# First, get the 'mmmmm' - VEX decoding table.
mmmmm = '%x' % self.M
# Now get the opcode. Should be only one.
opcodes = ['%02x' % x for x in self.Opcodes]
# Get the modrm redirections.
modrm = { "mod": self.Mod, "reg": my_str(self.Reg), "rm": my_str(self.Rm), "modpost": None }
# Get the pp, if any.
pp = my_str(self.P)
# Get the l, if any.
l = my_str(self.L)
# Get the w, if any.
w = my_str(self.W)
return (mmmmm, opcodes, modrm, pp, l, w)
def split_encoding_legacy(self):
# First redirection class, the opcode.
opcodes = ['%02x' % x for x in self.Opcodes]
# Second redirection class, modrm
modrm = { "mod": self.Mod, "reg": my_str(self.Reg), "rm": my_str(self.Rm), "modpost": None }
# Third redirection class, mandatory prefixes
mprefixes = []
if self.MustHaveF2:
mprefixes.append('F2')
if self.MustHaveF3:
mprefixes.append('F3')
if self.MustHave66:
mprefixes.append('66')
if self.Np:
mprefixes.append('NP')
if len(mprefixes) == 0 and (not (self.Xop or self.Vex or self.Evex or self.Mvex)) and\
(self.Opcodes[0] == 0x0F and self.Opcodes[1] in [0x3A, 0x38]):
mprefixes.append(None)
# Fourth redirection class, operating mode.
mode = []
if self.RedM16:
mode.append('m16')
elif self.RedM32:
mode.append('m32')
elif self.RedM64:
mode.append('m64')
# Fifth redirection class, default operand size.
dsize = []
if self.RedDs16:
dsize.append('ds16')
elif self.RedDs32:
dsize.append('ds32')
elif self.RedDs64:
dsize.append('ds64')
elif self.RedDDs64:
dsize.append('dds64')
elif self.RedFDs64:
dsize.append('fds64')
# Sixth redirection class, default address size.
asize = []
if self.RedAs16:
asize.append('as16')
elif self.RedAs32:
asize.append('as32')
elif self.RedAs64:
asize.append('as64')
# Seventh redirection class, REX prefix, REX.W, 64 bit mode, 0xF3, SIB. The important aspect here is that unlike
# the other classes, this is not exhaustive - if an instruction does not fit in any of the entries, it
# will default to index 0 (and it will not return invalid encoding, unless entry 0 is invalid).
oprefixes = []
if self.RedRex:
oprefixes.append('rex')
if self.RedRexW:
oprefixes.append('rexw')
if self.Red64:
oprefixes.append('64')
if self.RedF3:
oprefixes.append('aF3')
if self.RedRep:
oprefixes.append('rep')
# Vendor redirection, if any.
return (opcodes, modrm, mprefixes, mode, dsize, asize, oprefixes, self.Vendor, self.Feature)
def __str__(self):
# Get the operands
ops = ''
for o in self.ExpOps:
ops += o.__str__() + ','
ops = ops[:-1]
# Return a text reprezentation of the encoding
return (self.Mnemonic + ' ' + ops).strip()
def parse_entry(entry, template_flags = {}, template_cpuid = {}, template_modes = {}):
# make sure this is not a comment. Skip comments.
if entry.startswith('#') or len(entry) < 4: return None
try:
# Preprocess: remove comments, CR/LF
com = entry.find('#')
x = entry.replace('\x0D', '').replace('\x0A', '')
if -1 != com: x = entry[:com]
# Space can't be the first character.
if x[0] == ' ':
raise ParseLineException('Space cannot be the first character!')
# Extract the mnemonic
mnemonic = x[0:x.find(' ')].strip()
# Extract the explicit operands
x = x[x.find(' '):].strip()
expops = x[:x.find(' ')].split(',')
if len(expops) == 1 and expops[0] == 'nil': expops = []
# Extract the implicit operands
x = x[x.find(' '):].strip()
impops = x[:x.find(' ')].split(',')
if len(impops) == 1 and impops[0] == 'nil': impops = []
# Extract the encoding
x = x[x.find('[')+1:]
encoding = x[:x.find(']')].strip()
# Extract the flags, class, set, category, encoding, prefmap
attributes = prefmap = isaset = category = iclass = adop = rwm = None
cff = tuple = flgaccess = modes = exclass = fpuflg = None
x = x[x.find(']')+1:].strip()
while x:
start = x.find(':')
end = x.find(',')
if start == -1:
break
if end == -1:
end = len(x)
token = x[:start].strip()
value = x[start+1:end].strip()
# parse token
if token == 'a': # Instruction attributes.
attributes = value.split('|')
elif token == 'p': # Accepted prefixes.
prefmap = value.split('|')
elif token == 's': # Instruction set
isaset = value
elif token == 't': # Instruction type
category = value
elif token == 'c': # Instruction class. Defaults to the mnemonic if not specified.
iclass = value
elif token == 'w': # Read/write map
rwm = value.split('|')
elif token == 'i': # CPUID.
cff = value
elif token == 'l': # tuple
tuple = value
elif token == 'e':
exclass = value
elif token == 'f': # Flags access
flgaccess = []
for v in value.split('|'):
if v in template_flags:
flgaccess += template_flags[v].split('|')
else:
flgaccess.append(v)
elif token == 'u':
fpuflg = ['u', 'u', 'u', 'u'] # each one is undefined.
for v in value.split('|'):
flg, acc = v.split('=')
if flg not in ['C0', 'C1', 'C2', 'C3']:
raise ParseLineException('Unknown FPU flag: %s' % flg)
if acc not in ['0', '1', 'm', 'u']:
raise ParseLineException('Unknown FPU flag access: %s' % acc)
fpuflg[int(flg[1])] = acc
elif token == 'm': # CPU modes.
# Example: m:ring=0,1,2,3|vmx=root,nonroot|mode=real,v8086,smm,prot,compat,long|other=sgx,tsx
# Note: any group that is not specified is considered entirely valid
# Note: any group that is specified overrides all the other fields in the group; example:
# mode=real - this means the instruction is valid ONLY in real mode.
# mode=!v8086 - this means the instructiom is valid is ANY mode except for V8086
tmodes = []
for t in value.split('|'):
if t in template_modes:
tmodes += template_modes[t].split('|')
else:
tmodes.append(t)
modes = []
groups = {}
for g in valid_mode_groups:
groups[g] = {}
groups[g]["negated"] = False
groups[g]["specified"] = False
groups[g]["modes"] = []
for tm in tmodes:
m, v = tm.split('=')
for vx in v.split('+'):
negated = False
if vx.startswith('!'):
vx = vx[1:]
groups[m]["negated"] = True
if m not in valid_mode_groups:
raise ParseLineException('Unknown CPU mode group specified: %s' % m)
if vx not in valid_mode_map[m]:
raise ParseLineException('Mode %s is not valid for mode group %s; it can be one of [%s]' %
(vx, m, ','.join(valid_mode_map[m])))
groups[m]["specified"] = True
groups[m]["modes"].append(vx)
for g in groups:
if not groups[g]["specified"]:
modes += valid_mode_map[g]
elif not groups[g]["negated"]:
modes += groups[g]["modes"]
else:
modes += [x for x in valid_mode_map[g] if x not in groups[g]["modes"]]
else:
raise ParseLineException('Unknown token specified: %s' % token)
# Advance
if -1 == x.find(','):
x = ''
else:
x = x[x.find(',')+1:].strip()
if attributes is None:
attributes = []
if prefmap is None:
prefmap = []
if isaset is None:
isaset = 'UNKNOWN'
if category is None:
category = 'UNKNOWN'
if iclass is None:
iclass = mnemonic
if rwm is None:
rwm = []
if cff is None:
cff = None
if modes is None:
# No mode specified, assume validity in all modes.
modes = []
modes += valid_cpu_modes
if flgaccess is None:
flgaccess = []
if fpuflg is None:
# fpuflg[x] is for Cx (fpuflg[0] = C0, fpuflg[1] = C1, etc.)
# u = undefined, m = modified, 0 = cleared to 0, 1 = set to 1.
fpuflg = ['u', 'u', 'u', 'u']
# Validate the tokens.
# The set can be anything.
# The type can be anything.
# The iclass can be missing, it will default to the mnemonic.
# The read/write map must have the same size as the number of operands.
if len(rwm) < len(expops) + len(impops):
raise ParseLineException('Invalid number of operand access specifiers: provided %d, expecting at least %d' %
(len(rwm), len(expops) + len(impops)))
for r in rwm:
if r not in valid_access:
raise ParseLineException('Unknown access specifier "%s", expecting one of [%s]' %
(r, ','.join(valid_access)))
# The CPUID can be anything, even if it doesn't match something specified in cpuid.dat.
# The modes must be one of the valid modes.
for m in modes:
if m.startswith('!'):
m = m[1:]
if m not in valid_cpu_modes:
raise ParseLineException('Unknown CPU mode specifier "%s", expecting one of [%s]' %
(m, ','.join(valid_cpu_modes)))
# Validate the prefixes.
for p in prefmap:
if p not in valid_prefixes:
raise ParseLineException('Unknown prefix specifier "%s", expecting one of [%s]' %
(p, ','.join(valid_prefixes)))
# Validate the tuples.
if tuple and tuple not in valid_tuples:
raise ParseLineException('Unknown tuple specifier "%s", expecting one of [%s]' %
(tuple, ','.join(valid_tuples)))
# Validate the attributes.
for a in attributes:
if a not in valid_attributes:
raise ParseLineException('Unknown attribute specifier "%s", expecting one of [%s]' %
(a, ','.join(valid_attributes)))
# Validate the flags.
revflg = {}
for m in valid_flag_op:
revflg[m] = []
for flg in flgaccess:
f, m = flg.split('=')
if m not in valid_flag_op:
raise ParseLineException('Unknow flag access specifier "%s", expecting one of [%s]' %
(m, ','.join(valid_flag_op)))
if f not in valid_flags:
raise ParseLineException('Unknow flag specifier "%s", expecting one of [%s]' %
(f, ','.join(valid_flas)))
revflg[m].append(f)
flgaccess = revflg
iinfo = {
"mnemonic" : mnemonic, # Mnemonic
"expops" : expops, # Explicit operands
"impops" : impops, # Implicit operands
"encoding" : encoding, # Encoding
"flags" : attributes, # Instruction attributes
"prefixes" : prefmap, # Accepted prefixes
"set" : isaset, # Instruction set
"cat" : category, # Instruction category
"class" : iclass, # Instruction class
"rwm" : rwm, # Read/write operands map
"cff" : cff, # CPUID feature flag
"tuple" : tuple, # Tuple type, for EVEX instruxtions
"exclass" : exclass, # Exception class, for SSE/VEX/EVEX instructions
"flgaccess" : flgaccess, # RFLAGS access
"modes" : modes, # Valid operating modes
"fpuflg" : fpuflg, # FPU flags access (C0, C1, C2, C3), valid for x87 instructions only
}
if 'PREFIX' in attributes:
return None
try:
ins = Instruction(iinfo)
except:
raise
except Exception as e:
raise
return ins
def parse_ins_file(fpath, template_flags = {}, template_cpuid = {}, template_modes = {}):
instructions = []
lcount = 0
for line in open(fpath, 'rt'):
lcount += 1
try:
ins = parse_entry(line, template_flags, template_cpuid, template_modes)
if ins: instructions.append(ins)
except Exception as e:
print('ERROR: Parsing failed at %s:%d: %s' % (fpath, lcount, e))
raise
return instructions
def parse_pre_file(fpath):
prefixes = []
for line in open(fpath, 'rt'):
# Ignore comments.
if line.startswith('#'):
continue
res = re.findall(r'([^\s]+)\s*\[\s*(0x[0-9a-fA-F]+)\]', line)
if not res:
continue
res = res[0]
pref = {}
pref["mnemonic"] = res[0]
pref["encoding"] = res[1]
prefixes.append(Prefix(pref))
return prefixes
def parse_cff_file(fpath):
features = []
for line in open(fpath, 'rt'):
if line.startswith('#'):
continue
res = re.findall(r'([^\s]+)\s+:\s+(0x[0-9a-fA-F]+),\s+(0x[0-9a-fA-F]+),\s+(EAX|ECX|EDX|EBX),\s+(\d+)', line)
if not res:
continue
res = res[0]
cffi = {}
cffi["name"] = res[0]
cffi["leaf"] = res[1]
cffi["subleaf"] = res[2]
cffi["reg"] = res[3]
cffi["bit"] = res[4]
features.append(CpuidFeatureFlag(cffi))
return features
def parse_flags_file(fpath):
flags = {}
for line in open(fpath, 'rt'):
if line.startswith('#'):
continue
res = re.findall(r'([^\s]+)\s+:([^$]+)', line)
if not res:
continue
res = res[0]
flags[res[0]] = res[1].strip('\n\r ')
return flags
def parse_modess_file(fpath):
modes = {}
for line in open(fpath, 'rt'):
if line.startswith('#'):
continue
res = re.findall(r'([^\s]+)\s+:([^$]+)', line)
if not res:
continue
res = res[0]
modes[res[0]] = res[1].strip('\n\r ')
return modes
#
# =============================================================================
# Main
# =============================================================================
#
if __name__ == "__main__":
if len(sys.argv) < 2:
print('Usage: %s defs-file' % os.path.basename(sys.argv[0]))
sys.exit(-1)
# Parse the flags file.
flags = parse_flags_file('%s/flags.dat' % sys.argv[1])
# Parse the prefixes
prefixes = parse_pre_file('%s/prefixes.dat' % sys.argv[1])
# Parse the cpuid feature flags and extract each feature
features = parse_cff_file('%s/cpuid.dat' % sys.argv[1])
# Parse the modes file.
modes = parse_modess_file('%s/modes.dat' % sys.argv[1])
# Parse the instruction file and extract the instructions
instructions = []
for fn in glob.glob('%s/table*.dat' % sys.argv[1]):
instructions += parse_ins_file(fn, flags, features)
# Sort the instructions.
instructions = sorted(instructions, key = lambda x: x.Mnemonic)
for i in range(0, len(instructions)):
print(instructions[i])
for i in range(0, len(prefixes)):
print(prefixes[i])
features = sorted(features, key = lambda x: x.Name)
for i in range(0, len(features)):
print(features[i])