mirror of
https://github.com/bitdefender/bddisasm.git
synced 2025-01-18 11:00:57 +00:00
965 lines
32 KiB
Python
965 lines
32 KiB
Python
#!/usr/bin/env python3
|
|
#
|
|
# Copyright (c) 2024 Bitdefender
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
import os
|
|
import sys
|
|
import re
|
|
import copy
|
|
import glob
|
|
import disasmlib
|
|
|
|
# Banner written at the top of every generated C header: license block followed by the
# "auto-generated" warning and one blank line.
header = '\n'.join((
    '/*',
    '* Copyright (c) 2024 Bitdefender',
    '* SPDX-License-Identifier: Apache-2.0',
    '*/',
    '',
    '//',
    '// This file was auto-generated by generate_tables.py. DO NOT MODIFY!',
    '//',
    '',
    '',
))
|
|
|
|
# Selects the initializer syntax used for each entry of the generated instructions database:
#   True  -> designated initializers:  ".FieldName = Value"
#   False -> aggregate initializers:   "/* FieldName */ Value"
# Designated initializers can be used when building as C code (recommended!) or C++ >= 20.
# Aggregate initializers must be used when building as C++ code (NOT recommended).
idbe_format_designated = True
|
|
|
|
|
|
#
|
|
# These are the encoding components used to group instructions. Important things to note:
|
|
# - The order here is important! The higher up (the lower the index) for a given component, the more priority it
|
|
# receives when grouping instructions
|
|
# - The first entry of each tuple is the component type/name
|
|
# - The second is a boolean indicating whether the component is mandatory for all or optional
|
|
#
|
|
# For example, "opcode" is mandatory for all instructions in a group, as we can't group them otherwise. However,
|
|
# the auxiliary entry is not mandatory - in a given group, some instructions may have such an entry, while others do
|
|
# not. Those that lack an optional component will be treated as "default" encoding.
|
|
#
|
|
# Example of optional component:
|
|
# 0x90 : NOP
|
|
# repz 0x90 : PAUSE
|
|
# In this case, the `repz` is an optional encoding component, meaning that its presence will cause PAUSE
|
|
# to be decoded, but its absence will cause NOP to be decoded.
|
|
#
|
|
# Example of mandatory component:
|
|
# 0x0F 0x00 /0:mem: SLDT Mw
|
|
# 0x0F 0x00 /0:reg: SLDT Rv
|
|
# In this case, all instructions must specify both the reg component (/0) and the mod (mem or reg).
|
|
# If any of them is present for one instruction but absent for another, this will lead to an error. For example, the
|
|
# following spec will cause an error, since there's no grouping possible, because one of the instructions specifies
|
|
# mem mode, while the other does not specify anything:
|
|
# 0x0F 0x00 /0:mem : SLDT Mw
|
|
# 0x0F 0x00 /0 : SLDT Rv
|
|
#
|
|
# Grouping components for legacy-encoded instructions, in priority order (earlier = higher
# priority). 'all' is True when every instruction of a group must specify the component.
components_legacy = [
    {'type': name, 'all': mandatory}
    for name, mandatory in (
        ('opcode',      True),
        ('opcode_last', True),
        ('vendor',      False),
        ('feature',     False),
        ('prefix',      True),
        ('modrmreg',    True),
        ('modrmmod',    True),
        ('modrmrm',     True),
        ('mode',        False),
        ('dsize',       False),
        ('asize',       False),
        ('auxiliary',   False),
        ('w',           True),
    )
]
|
|
|
|
# Grouping components for the table built from VEX, XOP and EVEX instructions, in priority
# order. Every component here is mandatory ('all' is True) for all instructions of a group.
components_ex = [
    {'type': name, 'all': True}
    for name in (
        'mmmmm',
        'opcode',
        'pp',
        'modrmreg',
        'modrmmod',
        'modrmrm',
        'l',
        'w',
        'wi',
        'nd',
        'nf',
        'sc',
    )
]
|
|
|
|
|
|
# Maps symbolic component values to their index inside the C decoding table of that
# component. Values not listed here are parsed as hexadecimal numbers by compute_index().
component_value_index = {
    # Absent component -> default entry
    None: 0, 'None': 0,

    # modrm.mod
    'mem': 0, 'reg': 1,

    # mandatory prefixes; using 'P' prefix so they're not confused with an opcode
    'PNP': 0, 'P0x66': 1, 'P0xF3': 2, 'P0xF2': 3,

    # other prefixes/redirection conditions
    'rexb': 1, 'rexw': 2, 'mo64': 3, 'repz': 4,
    'rep': 5, 'riprel': 6, 'rex2': 7, 'rex2w': 8,

    # Mode
    'm16': 1, 'm32': 2, 'm64': 3,

    # Default data size
    'ds16': 1, 'ds32': 2, 'ds64': 3, 'dds64': 4, 'fds64': 5,

    # Default address size
    'as16': 1, 'as32': 2, 'as64': 3,

    # Vendor redirection.
    'any': 0, 'intel': 1, 'amd': 2,

    # Feature redirection.
    'mpx': 1, 'cet': 2, 'cldm': 3, 'piti': 4,
}
|
|
|
|
|
|
#
|
|
# This dictionary describes how the decoding tables look. Each decoding component has associated a C decoding table.
|
|
#
|
|
# For every decoding component: the ILUT type constant stored in the generated table
# ('ilut'), the number of entries of that table ('size') and its C type name ('type').
components_ilut = {
    name: {'ilut': ilut, 'size': size, 'type': table_type}
    for name, ilut, size, table_type in (
        ('opcode',      'ND_ILUT_OPCODE',      256, 'ND_TABLE_OPCODE'),
        ('opcode_last', 'ND_ILUT_OPCODE_LAST', 256, 'ND_TABLE_OPCODE'),
        ('modrmmod',    'ND_ILUT_MODRM_MOD',   2,   'ND_TABLE_MODRM_MOD'),
        ('modrmreg',    'ND_ILUT_MODRM_REG',   8,   'ND_TABLE_MODRM_REG'),
        ('modrmrm',     'ND_ILUT_MODRM_RM',    8,   'ND_TABLE_MODRM_RM'),
        ('prefix',      'ND_ILUT_MAN_PREFIX',  4,   'ND_TABLE_MPREFIX'),
        ('mode',        'ND_ILUT_MODE',        4,   'ND_TABLE_MODE'),
        ('dsize',       'ND_ILUT_DSIZE',       6,   'ND_TABLE_DSIZE'),
        ('asize',       'ND_ILUT_ASIZE',       4,   'ND_TABLE_ASIZE'),
        ('auxiliary',   'ND_ILUT_AUXILIARY',   10,  'ND_TABLE_AUXILIARY'),
        ('vendor',      'ND_ILUT_VENDOR',      6,   'ND_TABLE_VENDOR'),
        ('feature',     'ND_ILUT_FEATURE',     8,   'ND_TABLE_FEATURE'),
        ('mmmmm',       'ND_ILUT_EX_M',        32,  'ND_TABLE_EX_M'),
        ('pp',          'ND_ILUT_EX_PP',       4,   'ND_TABLE_EX_PP'),
        ('l',           'ND_ILUT_EX_L',        4,   'ND_TABLE_EX_L'),
        ('w',           'ND_ILUT_EX_W',        2,   'ND_TABLE_EX_W'),
        ('wi',          'ND_ILUT_EX_WI',       2,   'ND_TABLE_EX_W'),
        ('nd',          'ND_ILUT_EX_ND',       2,   'ND_TABLE_EX_ND'),
        ('nf',          'ND_ILUT_EX_NF',       2,   'ND_TABLE_EX_NF'),
        ('sc',          'ND_ILUT_EX_SC',       16,  'ND_TABLE_EX_SC'),
    )
}
|
|
|
|
|
|
# Module-level state, (re)assigned by the script entry point and read by the generators
# above/below (instrux_to_idbe() looks up `mnemonics` and `features`).
mnemonics, mnemonics_prefix = [], []

instructions, prefixes, features = [], [], []
|
|
|
|
|
|
def instrux_to_idbe(
    ins: disasmlib.Instruction
) -> dict:
    """
    Generates a dictionary equivalent to the ND_IDBE structure. Each dictionary key is equivalent
    to a ND_IDBE structure field. Restrictions:
    - The order of the keys must be identical to the order of fields inside ND_IDBE
    - There must be no gaps; if a field is not used, just initialize it to some default value
    - The names must be identical to field names inside ND_IDBE

    The module-level `mnemonics` and `features` lists are consulted for the mnemonic index and
    the CPUID flag.

    Parameters
    ----------
    ins: disasmlib.Instruction
        The instruction to be converted to a dictionary.

    Returns
    -------
    A dictionary representing the bddisasm C definition of this instrux. Every value is a string
    holding a C expression, except 'Operands', which is a list of strings built by cdef_operand().

    Raises
    ------
    ValueError
        If ins.Mnemonic is not present in the module-level `mnemonics` list.
    """
    d = {}

    # Instruction class
    d['Instruction'] = 'ND_INS_' + ins.Class

    # Instruction Category
    d['Category'] = 'ND_CAT_' + ins.Category

    # ISA Set
    d['IsaSet'] = 'ND_SET_' + ins.Set

    # Mnemonic (index)
    d['Mnemonic'] = '%d' % (mnemonics.index(ins.Mnemonic))

    # Accepted prefixes map
    if ins.Prefmap:
        d['ValidPrefixes'] = '|'.join(['ND_PREF_' + x.upper() for x in ins.Prefmap])
    else:
        d['ValidPrefixes'] = '0'

    # Valid modes; collapse to ND_MOD_ANY when every known CPU mode is accepted.
    if all(m in ins.Modes for m in disasmlib.valid_cpu_modes):
        d['ValidModes'] = 'ND_MOD_ANY'
    else:
        d['ValidModes'] = '|'.join(['ND_MOD_' + m.upper() for m in ins.Modes])

    # Valid decorators
    if ins.DecoFlags:
        d['ValidDecorators'] = '|'.join(['ND_DECO_' + x.upper() for x in ins.DecoFlags])
    else:
        d['ValidDecorators'] = '0'

    # Operand count
    d['OpsCount'] = 'ND_OPS_CNT(%d, %d)' % (len(ins.ExpOps), len(ins.ImpOps))

    # EVEX tuple type
    if ins.Evex and ins.Tuple:
        d['TupleType'] = 'ND_TUPLE_' + ins.Tuple.upper()
    else:
        d['TupleType'] = '0'

    # Exception type
    if ins.ExType:
        d['ExcType'] = 'ND_EXT_' + ins.ExType
    else:
        d['ExcType'] = '0'

    # FpuFlags (x87 instructions only)
    if ins.Set == 'X87':
        # Two bits per entry: the i-th of the first four FpuFlags entries lands at bit 2*i.
        acc = {'0': 0, '1': 1, 'm': 2, 'u': 3}
        value = 0
        for i in range(4):
            value |= acc[ins.FpuFlags[i]] << (i * 2)
        d['FpuFlags'] = '0x%02x' % value
    else:
        d['FpuFlags'] = '0'

    # EVEX mode
    if ins.EvexMode:
        d['EvexMode'] = 'ND_EVEXM_' + ins.EvexMode.upper()
    else:
        d['EvexMode'] = '0'

    # Flags (tested, modified, set, cleared). The tuple order fixes the key order in `d`.
    for key, field in (('t', 'TestedFlags'), ('m', 'ModifiedFlags'),
                       ('1', 'SetFlags'), ('0', 'ClearedFlags')):
        flags = ins.Rflags[key]
        if key in ('1', '0'):
            # Flags listed under 'u' are reported in both the set and the cleared masks.
            flags = flags + ins.Rflags['u']
        d[field] = ''.join(['0'] + ['|NDR_RFLAG_%s' % f.upper() for f in flags])

    # Instruction attributes; 'nil' and the per-operand OP1..OP6 attributes are skipped.
    operand_markers = ('OP1', 'OP2', 'OP3', 'OP4', 'OP5', 'OP6')
    d['Attributes'] = '|'.join([
        'ND_FLAG_' + x.upper() for x in ins.Attributes
        if x != 'nil' and not x.startswith(operand_markers)
    ]) or '0'

    # CPUID flag
    flg = '0'
    for feat in features:
        if feat.Name == ins.Id:
            flg = 'ND_CFF_%s' % feat.Name
    d['CpuidFlag'] = flg

    # List of instruction operands (explicit first, then implicit)
    d['Operands'] = [cdef_operand(op) for op in ins.ExpOps + ins.ImpOps]

    return d
|
|
|
|
def cdef_operand(
    op: disasmlib.Operand
) -> str:
    """
    Generates a bddisasm C definition for the current operand.

    Parameters
    ----------
    op: Operand
        The operand to be converted in a C definition.

    Returns
    -------
    A string representing the bddisasm C definition of this operand.

    Example
    -------
    "OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0)"
    """
    # A size of '?' is emitted as ND_OPS_unknown.
    size = op.Size if op.Size != '?' else 'unknown'

    # Empty flag/decorator lists are emitted as the literal '0'.
    flags = '|'.join(['ND_OPF_' + x for x in op.Flags]) or '0'
    decorators = '|'.join(['ND_OPD_' + disasmlib.deco_op_flags[x] for x in op.Decorators]) or '0'

    return 'OP(%s, %s, %s, %s, %s, %d)' % (
        'ND_OPT_' + op.Type,
        'ND_OPS_' + size,
        flags,
        'ND_OPA_' + op.Access,
        decorators,
        op.Block)
|
|
|
|
def cdef_instruction(
    ins: disasmlib.Instruction
) -> str:
    """
    Generates a bddisasm C or CPP definition for the current instruction.
    If the module-level idbe_format_designated is True, designated initializers are used
    (".Field = value,"); otherwise aggregate initialization with the field name in a comment
    is used ("/* Field */ value,").

    Parameters
    ----------
    ins: Instruction
        The instruction to be converted in a C structure.

    Returns
    -------
    A multi-line string representing the bddisasm C or CPP definition of this instruction.
    The string ends with the closing brace, without a trailing comma or newline.

    Example
    -------
    Designated initializer definition (abridged):
        // Pos:3 Instruction:"AADD My,Gy" Encoding:"NP 0x0F 0x38 0xFC /r:mem"/"MR"
        {
            .Instruction = ND_INS_AADD,
            .Category = ND_CAT_RAOINT,
            ...
            .Operands =
            {
                OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0),
                OP(ND_OPT_G, ND_OPS_y, 0, ND_OPA_R, 0, 0),
            },
        }

    Aggregate initializer definition (abridged):
        // Pos:3 Instruction:"AADD My,Gy" Encoding:"NP 0x0F 0x38 0xFC /r:mem"/"MR"
        {
            /* Instruction */ ND_INS_AADD,
            /* Category */ ND_CAT_RAOINT,
            ...
            /* Operands */
            {
                OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0),
                OP(ND_OPT_G, ND_OPS_y, 0, ND_OPA_R, 0, 0),
            },
        }
    """
    idbe = instrux_to_idbe(ins)

    # Operand encodings of the explicit operands, concatenated, with every 'S' removed.
    operand_encodings = ''.join([x.Encoding for x in ins.ExpOps]).replace('S', '')

    # Start with the position and encoding description.
    parts = [
        ' // Pos:%d Instruction:"%s" Encoding:"%s"/"%s"\n' % (
            ins.Icount,
            str(ins),
            ins.RawEnc,
            operand_encodings),
        ' {\n',
    ]

    for field, value in idbe.items():
        if idbe_format_designated:
            parts.append(' .%s = ' % field)
        else:
            parts.append(' /* %16s */ ' % field)

        if isinstance(value, list):
            # List-valued field (the operands): emit a nested brace block.
            parts.append('\n')
            parts.append(' {\n')
            if not value:
                # An empty initializer list is emitted as a single 0.
                parts.append(' 0\n')
            else:
                parts.extend(' ' + entry + ',\n' for entry in value)
            parts.append(' },\n')
        else:
            parts.append(str(value) + ',\n')

    parts.append(' }')

    return ''.join(parts)
|
|
|
|
|
|
def compute_index(
    value: str
) -> int:
    """
    Translate a component value into its index inside the C decoding table.

    Values known to the component_value_index dict are translated through it. Anything else is
    treated as a hexadecimal number and converted with int(value, 16). The resulting index
    is the slot used to reach the next entry of the multi-way decode tree.

    Parameters
    ----------
    value: str
        The component value to be converted to an integer index.

    Returns
    -------
    An integer representing the index of the given decode component.

    Example
    -------
    Input:
        value: repz
    Returns:
        4

    Input:
        value: 0xCC
    Returns:
        204
    """
    known = component_value_index.get(value)
    # Every table value is an int, so None can only mean "not in the table".
    return int(value, 16) if known is None else known
|
|
|
|
|
|
def group_find_component(
    instructions: list[disasmlib.Instruction],
    components: list[dict]
) -> dict:
    """
    Pick the first decoding component, in list order, that can be used to split the given
    instructions. For example, on an initial call with the list of all legacy instructions,
    the "opcode" component would be returned.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        The list of instructions to be grouped.
    components:
        The list of components used for grouping.

    Returns
    -------
    A dict representing the decode component that can be used to cover all instructions in the list.
    None if no such component could be found.
    """
    for candidate in components:
        kind = candidate['type']
        specified = (bool(ins.Encoding[kind]) for ins in instructions)

        # Mandatory components (for example, opcode) must be specified by every instruction;
        # optional ones only need to be specified by at least one of them.
        usable = all(specified) if candidate['all'] else any(specified)

        if usable:
            return candidate

    return None
|
|
|
|
|
|
def group_instructions(
    instructions: list[disasmlib.Instruction],
    components: list[dict]
) -> dict:
    """
    Build the decode tree for a list of instructions: find the grouping component, spread the
    instructions over one bucket per possible component value, then recurse into every
    non-empty bucket until only leaf entries (a single Instruction) remain.

    Each grouping step pops the consumed value from the instruction's Encoding list for that
    component, so the instructions are modified in place.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        The list of instructions to be grouped.
    components:
        The list of components used for grouping.

    Returns
    -------
    A dictionary containing two keys:
    - "component": the component type used for the current grouping, or "leaf".
    - "children": for a leaf, the Instruction itself. Otherwise a list of N entries, where N is
      the table size of the component (256 for "opcode", 8 for "modrmreg", ...); each entry is
      either an empty list or the group dictionary of the instructions that landed there.

    Raises
    ------
    Exception
        If several instructions remain but no component can tell them apart.

    Example
    -------
    For the (simplified) instructions
        [{"I1", "0xBD"}, {"I2", "0xCC"}, {"I4", "FF /1"}, {"I5", "FF /5"}]
    the first call groups by "opcode": slots 0xBD and 0xCC each hold one instruction and become
    leaves, while slot 0xFF holds I4 and I5 and is grouped again by "modrmreg", placing I4 at
    position 1 and I5 at position 5.
    """
    # Find a good grouping component for the current instruction list.
    comp = group_find_component(instructions, components)

    if not comp:
        if len(instructions) == 1:
            # Reached leaf entry, no more grouping needed.
            return {'component': 'leaf', 'children': instructions[0]}

        # No grouping component found for multiple instructions - error.
        print("ERROR: Cannot properly group the following instructions. Please review specs!")
        for ins in instructions:
            print(" -> ", ins, " with encoding: ", ins.RawEnc)
        raise Exception("Grouping error: invalid/incomplete specification!")

    # One bucket per possible value of the chosen component.
    kind = comp['type']
    buckets = [[] for _ in range(components_ilut[kind]['size'])]

    # Distribute every instruction on its position, consuming the used component value so
    # that it is not used again. Instructions lacking the component go to the default slot 0.
    for ins in instructions:
        pending = ins.Encoding[kind]
        slot = compute_index(pending.pop(0)) if len(pending) > 0 else 0
        buckets[slot].append(ins)

    # Recursively group every non-empty bucket; empty ones are kept as empty lists.
    return {
        'component': kind,
        'children': [
            group_instructions(bucket, components) if bucket else bucket
            for bucket in buckets
        ],
    }
|
|
|
|
|
|
def group_dump(
    group: dict,
    level: int = 0
) -> None:
    """
    Dump the entire translation tree identified by the root "group".

    Parameters
    ----------
    group: dict
        A group dictionary with the "component" and "children" keys, as built by
        group_instructions().
    level: int
        Current depth in the tree; each printed line is prefixed with `level` spaces.
    """
    if group['component'] == 'leaf':
        # For a leaf, "children" is the entry itself rather than a list.
        print(" " * level, group['children'])
        return

    for index, child in enumerate(group['children']):
        # Skip empty slots.
        if not child:
            continue
        print(" " * level, group['component'], '%02x' % index)
        group_dump(child, level + 1)
|
|
|
|
|
|
def _write_translation_table(
    table: list,
    components: list,
    var_prefix: str,
    guard: str,
    file_name: str
):
    """
    Group the instructions of one encoding family and write the resulting C tables to
    ../bddisasm/include/<file_name>, wrapped in the `guard` include guard.
    """
    group = group_instructions(table, components)
    group_cdef = group_generate_c_table(group, '%s_%s' % (var_prefix, group['component']))

    print('Writing the %s file...' % file_name)
    with open(r'../bddisasm/include/' + file_name, 'wt') as f:
        f.write(header)
        f.write('#ifndef %s\n' % guard)
        f.write('#define %s\n\n' % guard)
        f.write(group_cdef)
        f.write('\n#endif\n\n')


def dump_translation_tables(
    instructions: list[disasmlib.Instruction]
):
    """
    Generate the instruction translation trees and write one header per encoding family
    (legacy, VEX, XOP, EVEX). Output paths are relative to the current working directory.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        All the instructions. Grouping consumes their Encoding lists (see group_instructions).
    """
    table_legacy = []
    table_xop = []
    table_vex = []
    table_evex = []

    # Distribute each instruction type into its own table.
    for i in instructions:
        if i.Vex:
            table_vex.append(i)
        elif i.Xop:
            table_xop.append(i)
        elif i.Evex:
            table_evex.append(i)
        else:
            table_legacy.append(i)

    # Each family is grouped and written before the next one is processed.
    _write_translation_table(table_legacy, components_legacy, 'gLegacyMap',
                             'BDX86_TABLE_ROOT_H', 'bdx86_table_root.h')
    _write_translation_table(table_vex, components_ex, 'gVexMap',
                             'BDX86_TABLE_VEX_H', 'bdx86_table_vex.h')
    _write_translation_table(table_xop, components_ex, 'gXopMap',
                             'BDX86_TABLE_XOP_H', 'bdx86_table_xop.h')
    _write_translation_table(table_evex, components_ex, 'gEvexMap',
                             'BDX86_TABLE_EVEX_H', 'bdx86_table_evex.h')
|
|
|
|
|
|
def group_generate_c_table(
    group: dict,
    name: str
) -> str:
    """
    Generate the translation tree, in C format, for the decoding tree identified by group.

    Parameters
    ----------
    group: dict
        A group dictionary with the "component" and "children" keys, as built by
        group_instructions().
    name: str
        C identifier of the table generated for `group`. Child tables are named
        "<name>_<index in hex>_<child component>".

    Returns
    -------
    The C definitions of all the tables in the tree. Tables of children are placed before the
    table that references them, so every symbol is defined before it is used.
    """
    if group['component'] == 'leaf':
        # Instruction, construct a dummy table that directly points to the instruction.
        ins = group['children']
        return ''.join([
            'const ND_TABLE_INSTRUCTION %s = \n' % name,
            '{\n',
            ' ND_ILUT_INSTRUCTION,\n',
            ' (const void *)&gInstructions[% 4d] // %s\n' % (ins.Icount, str(ins)),
            '};\n\n',
        ])

    ilut = components_ilut[group['component']]
    rows = []
    child_tables = []

    for index, child in enumerate(group['children']):
        if not child:
            rows.append(' /* %02x */ (const void *)ND_NULL,\n' % (index))
            continue
        child_name = name + ('_%02x_%s' % (index, child['component']))
        rows.append(' /* %02x */ (const void *)&%s,\n' % (index, child_name))
        child_tables.append(group_generate_c_table(child, child_name))

    # Child tables come first, with the table of the highest populated slot at the very top.
    return ''.join(
        child_tables[::-1]
        + [
            'const %s %s = \n' % (ilut['type'], name),
            '{\n',
            ' %s,\n' % ilut['ilut'],
            ' {\n',
        ]
        + rows
        + [
            ' }\n',
            '};\n\n',
        ]
    )
|
|
|
|
|
|
def generate_mnemonics(instructions):
    """Return the sorted list of distinct Mnemonic values found in `instructions`."""
    return sorted({ins.Mnemonic for ins in instructions})
|
|
|
|
def generate_constants(lst, pre=False):
    """
    Return the sorted list of distinct constant names for the entries of `lst`.

    With `pre` set, names are "ND_PRE_" + Mnemonic; otherwise "ND_INS_" + Class.
    """
    if pre:
        names = {'ND_PRE_' + item.Mnemonic for item in lst}
    else:
        names = {'ND_INS_' + item.Class for item in lst}

    return sorted(names)
|
|
|
|
def generate_constants2(instructions):
    """
    Return two sorted lists of distinct names: the ISA sets ("ND_SET_" + Set) and the
    categories ("ND_CAT_" + Category) of the given instructions.
    """
    sets, categories = set(), set()

    for ins in instructions:
        sets.add('ND_SET_' + ins.Set)
        categories.add('ND_CAT_' + ins.Category)

    return sorted(sets), sorted(categories)
|
|
|
|
def dump_mnemonics(mnemonics, prefixes, fname):
    """
    Write the gMnemonics string table (a C header) to `fname`.

    Parameters
    ----------
    mnemonics:
        The mnemonic strings, emitted in the given order.
    prefixes:
        Not used by this function.
    fname:
        Path of the header file to be (over)written.
    """
    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_MNEMONICS_H\n')
        f.write('#define BDX86_MNEMONICS_H\n')
        f.write('\n')
        f.write('#ifndef BDDISASM_NO_MNEMONIC\n')
        f.write('\n')
        f.write('const char *gMnemonics[%d] = \n' % len(mnemonics))
        f.write('{\n')
        f.write(' ')

        # Start a new output line once more than 60 characters were emitted on the current
        # one (each entry is counted as its length plus 4, for the quotes, comma and space).
        ln = 0
        for m in mnemonics:
            f.write('"%s", ' % m)
            ln += len(m) + 4
            if ln > 60:
                ln = 0
                f.write('\n ')

        f.write('\n};\n')

        f.write('\n')
        f.write('#endif // !BDDISASM_NO_MNEMONIC\n')

        f.write('\n\n')

        f.write('#endif\n\n')
|
|
|
|
def dump_constants(constants, prefixes, constants_sets, constants_types, fname):
    """
    Write the instruction class, ISA set and category enums (a C header) to `fname`.

    `prefixes` is not used by this function.
    """
    # (enum tag, name of the zero-valued entry, remaining entries, typedef name)
    enums = (
        ('_ND_INS_CLASS', 'ND_INS_INVALID', constants, 'ND_INS_CLASS'),
        # Now the instruction sets.
        ('_ND_INS_SET', 'ND_SET_INVALID', constants_sets, 'ND_INS_SET'),
        # Now the instruction types.
        ('_ND_INS_TYPE', 'ND_CAT_INVALID', constants_types, 'ND_INS_CATEGORY'),
    )

    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_CONSTANTS_H\n')
        f.write('#define BDX86_CONSTANTS_H\n\n')
        f.write('\n')

        for tag, invalid_name, values, alias in enums:
            f.write('typedef enum %s\n' % tag)
            f.write('{\n')
            f.write(' %s = 0,\n' % invalid_name)
            for value in values:
                f.write(' %s,\n' % value)
            f.write('\n} %s;\n\n\n' % alias)

        # Done!
        f.write('\n#endif\n')
|
|
|
|
def dump_master_table(instructions, fname):
    """
    Write the gInstructions array (a C header) to `fname`, one entry per instruction, in the
    given order. Each entry is produced by cdef_instruction().
    """
    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_INSTRUCTIONS_H\n')
        f.write('#define BDX86_INSTRUCTIONS_H\n')
        f.write('\n')
        f.write('const ND_IDBE gInstructions[%s] = \n' % len(instructions))
        f.write('{\n')
        for ins in instructions:
            f.write('%s, \n\n' % cdef_instruction(ins))
        f.write('\n};\n')
        f.write('\n#endif\n')
|
|
|
|
def dump_features(features, fname):
    """
    Write the CPUID feature flag macros (a C header) to `fname`.

    Each feature yields "#define ND_CFF_<Name> ND_CFF(<Leaf>, <SubLeaf>, NDR_<Reg>, <Bit>)".
    Names shorter than 25 characters are padded with spaces up to 25 columns. Names of
    25 characters or more are followed by a single space, so that the macro name is never
    fused with its replacement text.
    """
    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_CPUID_FLAGS_H\n')
        f.write('#define BDX86_CPUID_FLAGS_H\n')

        f.write('\n')
        f.write('#define ND_CFF_NO_LEAF 0xFFFFFFFF\n')
        f.write('#define ND_CFF_NO_SUBLEAF 0x00FFFFFF\n')
        f.write('\n')
        f.write('\n')
        f.write('#define ND_CFF(leaf, subleaf, reg, bit) ((ND_UINT64)(leaf) | ((ND_UINT64)((subleaf) & 0xFFFFFF) << 32) | ((ND_UINT64)(reg) << 56) | ((ND_UINT64)(bit) << 59))\n')

        for c in features:
            # Always keep at least one separator between the macro name and its value.
            padding = ' ' * max(1, 25 - len(c.Name))
            f.write('#define ND_CFF_%s%sND_CFF(%s, %s, %s, %s)\n' % (
                c.Name, padding, c.Leaf, c.SubLeaf, 'NDR_' + c.Reg, c.Bit))

        f.write('\n')

        f.write('#endif // CPUID_FLAGS_H\n')
|
|
|
|
#
|
|
# =============================================================================
|
|
# Main
|
|
# =============================================================================
|
|
#
|
|
if __name__ == "__main__":
    # Script entry point: load the definition files from the directory given as the first
    # argument, then regenerate every header. Output paths are relative to the current
    # working directory. The names assigned here are module globals that the generator
    # functions read (instrux_to_idbe() uses `mnemonics` and `features`).
    if len(sys.argv) < 2:
        print('Usage: %s defs-file-dir' % os.path.basename(sys.argv[0]))
        sys.exit(-1)

    # Extract the flags.
    print('Loading flags access templates...')
    disasmlib.parse_flags_file('%s/flags.dat' % sys.argv[1])

    # Extract the CPUID features.
    print('Loading CPUID feature flags templates...')
    features = disasmlib.parse_cff_file('%s/cpuid.dat' % sys.argv[1])

    # Extract the valid modes.
    print('Loading CPU operating modes templates...')
    insmodes = disasmlib.parse_modess_file('%s/modes.dat' % sys.argv[1])

    # Extract the instructions. glob returns paths in arbitrary order; sort them so that the
    # load order (and therefore the order of instructions sharing a mnemonic after the stable
    # sort below) does not depend on the file system.
    for fn in sorted(glob.glob('%s/table*.dat' % sys.argv[1])):
        print('Loading instructions from %s...' % fn)
        instructions = instructions + disasmlib.parse_ins_file(fn)

    # Sort the instructions and record each one's position in the master table.
    instructions = sorted(instructions, key = lambda x: x.Mnemonic)
    for position, instruction in enumerate(instructions):
        instruction.Icount = position

    # Generate the mnemonics
    mnemonics = generate_mnemonics(instructions)
    mnemonics_prefixes = generate_mnemonics(prefixes)

    # Generate the constants
    constants = generate_constants(instructions)
    constants_prefixes = generate_constants(prefixes, True)
    constants_sets, constants_types = generate_constants2(instructions)

    #
    # Dump all data to files.
    #

    # Dump the mnemonics
    print('Writing the bdx86_mnemonics.h (instruction mnemonics) file...')
    dump_mnemonics(mnemonics, mnemonics_prefixes, r'../bddisasm/include/bdx86_mnemonics.h')

    # Dump the instruction constants
    print('Writing the bdx86_constants.h (instruction definitions) file...')
    dump_constants(constants, constants_prefixes, constants_sets, constants_types, r'../inc/bdx86_constants.h')

    # Dump the CPUID feature flags.
    print('Writing the bdx86_cpuidflags.h (CPUID feature flags) file...')
    dump_features(features, r'../inc/bdx86_cpuidflags.h')

    # Dump the instruction database.
    print('Writing the bdx86_instructions.h (main instruction database) file...')
    dump_master_table(instructions, r'../bddisasm/include/bdx86_instructions.h')

    # Dump the translation tables.
    print('Writing the translation tables...')
    dump_translation_tables(instructions)

    print('Instruction successfully parsed & header files generated!')
|