#!/usr/bin/env python3
#
# Copyright (c) 2024 Bitdefender
# SPDX-License-Identifier: Apache-2.0
#
import os
import sys
import re
import copy
import glob
import disasmlib
# Banner text written at the very top of every header file generated by this script.
header = '''/*
* Copyright (c) 2024 Bitdefender
* SPDX-License-Identifier: Apache-2.0
*/
//
// This file was auto-generated by generate_tables.py. DO NOT MODIFY!
//
'''
# Set this to True to generate the instruction database using the designated initializers format:
#     ".FieldName = Value"
# Set this to False to generate the instruction database using the aggregate initializers format:
#     "/* FieldName */ Value"
# Designated initializers can be used when building as C code (recommended!) or as C++ >= 20.
# Aggregate initializers must be used when building as C++ code (NOT recommended).
idbe_format_designated = True  # Read by cdef_instruction to choose how each field is emitted.
#
# These are the encoding components used to group instructions. Important things to note:
# - The order here is important! The higher up (the lower the index) a given component is, the more priority it
#   receives when grouping instructions.
# - The 'type' entry of each dictionary is the component type/name.
# - The 'all' entry is a boolean indicating whether the component is mandatory for all instructions or optional.
#
# For example, "opcode" is mandatory for all instructions in a group, as we can't group them otherwise. However,
# the auxiliary entry is not mandatory - in a given group, some instructions may have such an entry, while others do
# not. Those that lack an optional component will be treated as the "default" encoding.
#
# Example of an optional component:
#          0x90 : NOP
#     repz 0x90 : PAUSE
# In this case, `repz` is an optional encoding component, meaning that its presence will cause PAUSE
# to be decoded, but its absence will cause NOP to be decoded.
#
# Example of a mandatory component:
#     0x0F 0x00 /0:mem: SLDT Mw
#     0x0F 0x00 /0:reg: SLDT Rv
# In this case, all instructions must specify both the reg component (/0) and the mod (mem or reg).
# If any of them is present for one instruction but absent for another, an error is raised. For example, the
# following spec will cause an error, since no grouping is possible: one of the instructions specifies
# mem mode, while the other does not specify anything:
#     0x0F 0x00 /0:mem : SLDT Mw
#     0x0F 0x00 /0     : SLDT Rv
#
# Grouping components for legacy-encoded instructions, highest priority first.
# Each pair is (component type, whether the component is mandatory for all instructions in a group).
components_legacy = [
    {'type': kind, 'all': mandatory}
    for kind, mandatory in (
        ('opcode', True),
        ('opcode_last', True),
        ('vendor', False),
        ('feature', False),
        ('prefix', True),
        ('modrmreg', True),
        ('modrmmod', True),
        ('modrmrm', True),
        ('mode', False),
        ('dsize', False),
        ('asize', False),
        ('auxiliary', False),
        ('w', True),
    )
]
# Grouping components for the tables built with this list (VEX, XOP and EVEX in dump_translation_tables),
# highest priority first. Every one of them is mandatory for all instructions in a group.
components_ex = [
    {'type': kind, 'all': True}
    for kind in (
        'mmmmm',
        'opcode',
        'pp',
        'modrmreg',
        'modrmmod',
        'modrmrm',
        'l',
        'w',
        'wi',
        'nd',
        'nf',
        'sc',
    )
]
# Symbolic component value -> index inside the corresponding decoding table.
# Values not listed here are parsed as hexadecimal numbers by compute_index.
component_value_index = {
    # Missing/default value.
    None: 0,
    'None': 0,

    # modrm.mod
    'mem': 0,
    'reg': 1,

    # Mandatory prefixes; the 'P' prefix keeps them from being confused with an opcode.
    'PNP': 0,
    'P0x66': 1,
    'P0xF3': 2,
    'P0xF2': 3,

    # Other prefixes/redirection conditions.
    'rexb': 1,
    'rexw': 2,
    'mo64': 3,
    'repz': 4,
    'rep': 5,
    'riprel': 6,
    'rex2': 7,
    'rex2w': 8,

    # Mode.
    'm16': 1,
    'm32': 2,
    'm64': 3,

    # Default data size.
    'ds16': 1,
    'ds32': 2,
    'ds64': 3,
    'dds64': 4,
    'fds64': 5,

    # Default address size.
    'as16': 1,
    'as32': 2,
    'as64': 3,

    # Vendor redirection.
    'any': 0,
    'intel': 1,
    'amd': 2,

    # Feature redirection.
    'mpx': 1,
    'cet': 2,
    'cldm': 3,
    'piti': 4,
}
#
# Shape of the C decoding table associated with each decoding component:
#   'ilut' - the ND_ILUT_* constant emitted as the first member of the table,
#   'size' - the number of child slots in the table,
#   'type' - the C type used to declare the table.
# Each row below is (component, ilut, size, type).
#
components_ilut = {
    component: {'ilut': ilut, 'size': size, 'type': ctype}
    for component, ilut, size, ctype in (
        ('opcode', 'ND_ILUT_OPCODE', 256, 'ND_TABLE_OPCODE'),
        ('opcode_last', 'ND_ILUT_OPCODE_LAST', 256, 'ND_TABLE_OPCODE'),
        ('modrmmod', 'ND_ILUT_MODRM_MOD', 2, 'ND_TABLE_MODRM_MOD'),
        ('modrmreg', 'ND_ILUT_MODRM_REG', 8, 'ND_TABLE_MODRM_REG'),
        ('modrmrm', 'ND_ILUT_MODRM_RM', 8, 'ND_TABLE_MODRM_RM'),
        ('prefix', 'ND_ILUT_MAN_PREFIX', 4, 'ND_TABLE_MPREFIX'),
        ('mode', 'ND_ILUT_MODE', 4, 'ND_TABLE_MODE'),
        ('dsize', 'ND_ILUT_DSIZE', 6, 'ND_TABLE_DSIZE'),
        ('asize', 'ND_ILUT_ASIZE', 4, 'ND_TABLE_ASIZE'),
        ('auxiliary', 'ND_ILUT_AUXILIARY', 10, 'ND_TABLE_AUXILIARY'),
        ('vendor', 'ND_ILUT_VENDOR', 6, 'ND_TABLE_VENDOR'),
        ('feature', 'ND_ILUT_FEATURE', 8, 'ND_TABLE_FEATURE'),
        ('mmmmm', 'ND_ILUT_EX_M', 32, 'ND_TABLE_EX_M'),
        ('pp', 'ND_ILUT_EX_PP', 4, 'ND_TABLE_EX_PP'),
        ('l', 'ND_ILUT_EX_L', 4, 'ND_TABLE_EX_L'),
        ('w', 'ND_ILUT_EX_W', 2, 'ND_TABLE_EX_W'),
        ('wi', 'ND_ILUT_EX_WI', 2, 'ND_TABLE_EX_W'),
        ('nd', 'ND_ILUT_EX_ND', 2, 'ND_TABLE_EX_ND'),
        ('nf', 'ND_ILUT_EX_NF', 2, 'ND_TABLE_EX_NF'),
        ('sc', 'ND_ILUT_EX_SC', 16, 'ND_TABLE_EX_SC'),
    )
}
# Instruction mnemonics; rebuilt by the main block, and instrux_to_idbe stores each instruction's
# position in this list as its 'Mnemonic' field.
mnemonics = []
# Not referenced by any other code visible in this file.
mnemonics_prefix = []
# All instructions loaded from the table*.dat definition files by the main block.
instructions = []
# Never populated in the code visible here; the main block passes it to generate_mnemonics and
# generate_constants as-is.
prefixes = []
# CPUID feature descriptors loaded by the main block; instrux_to_idbe matches them against ins.Id.
features = []
def instrux_to_idbe(
    ins: disasmlib.Instruction
) -> dict:
    """
    Describe one instruction as a dictionary that mirrors the ND_IDBE C structure.

    Each key stands for one ND_IDBE field. Restrictions:
        - Keys must be inserted in the same order as the fields inside ND_IDBE.
        - There must be no gaps; an unused field is still initialized to some default value.
        - Key names must be identical to the field names inside ND_IDBE.

    Parameters
    ----------
    ins: disasmlib.Instruction
        The instruction to be converted to a dictionary.

    Returns
    -------
    A dictionary representing the bddisasm C definition of this instrux. All values are strings holding
    C expressions, except 'Operands', which is a list of operand definitions built by cdef_operand.
    """
    def _or_flags(prefix, names):
        # OR together the prefixed, upper-cased names; an empty/missing collection yields '0'.
        if not names:
            return '0'
        return '|'.join([prefix + name.upper() for name in names])

    # Class, category, ISA set, mnemonic index and accepted prefixes.
    entry = {
        'Instruction': 'ND_INS_' + ins.Class,
        'Category': 'ND_CAT_' + ins.Category,
        'IsaSet': 'ND_SET_' + ins.Set,
        'Mnemonic': '%d' % (mnemonics.index(ins.Mnemonic)),
        'ValidPrefixes': _or_flags('ND_PREF_', ins.Prefmap),
    }

    # Valid modes: a single constant when every known CPU mode is accepted.
    if all(mode in ins.Modes for mode in disasmlib.valid_cpu_modes):
        entry['ValidModes'] = 'ND_MOD_ANY'
    else:
        entry['ValidModes'] = '|'.join(['ND_MOD_' + mode.upper() for mode in ins.Modes])

    # Valid decorators.
    entry['ValidDecorators'] = _or_flags('ND_DECO_', ins.DecoFlags)

    # Operand count (explicit, implicit).
    entry['OpsCount'] = 'ND_OPS_CNT(%d, %d)' % (len(ins.ExpOps), len(ins.ImpOps))

    # EVEX tuple type.
    entry['TupleType'] = 'ND_TUPLE_' + ins.Tuple.upper() if ins.Evex and ins.Tuple else '0'

    # Exception type.
    entry['ExcType'] = 'ND_EXT_' + ins.ExType if ins.ExType else '0'

    # FpuFlags (x87 instructions only): four 2-bit fields, first flag in the lowest bits.
    if ins.Set == 'X87':
        encoding = {'0': 0, '1': 1, 'm': 2, 'u': 3}
        packed = 0
        for position in range(4):
            packed |= encoding[ins.FpuFlags[position]] << (2 * position)
        entry['FpuFlags'] = '0x%02x' % packed
    else:
        entry['FpuFlags'] = '0'

    # EVEX mode.
    entry['EvexMode'] = 'ND_EVEXM_' + ins.EvexMode.upper() if ins.EvexMode else '0'

    # Flags (tested, modified, set, cleared). The 'u' flags are appended to both the set and cleared lists.
    for access, field in (('t', 'TestedFlags'), ('m', 'ModifiedFlags'), ('1', 'SetFlags'), ('0', 'ClearedFlags')):
        flags = ins.Rflags[access]
        if access in ('1', '0'):
            flags = flags + ins.Rflags['u']
        entry[field] = '0' + ''.join(['|NDR_RFLAG_%s' % flag.upper() for flag in flags])

    # Instruction attributes; 'nil' and the per-operand OP1..OP6 attributes are left out.
    operand_markers = ('OP1', 'OP2', 'OP3', 'OP4', 'OP5', 'OP6')
    attributes = ['ND_FLAG_' + attr.upper() for attr in ins.Attributes
                  if attr != 'nil' and not attr.startswith(operand_markers)]
    entry['Attributes'] = '|'.join(attributes) or '0'

    # CPUID flag: the last feature whose name equals the instruction Id wins.
    cpuid_flag = '0'
    for feature in features:
        if feature.Name == ins.Id:
            cpuid_flag = 'ND_CFF_%s' % feature.Name
    entry['CpuidFlag'] = cpuid_flag

    # List of instruction operands, explicit ones first.
    entry['Operands'] = [cdef_operand(operand) for operand in ins.ExpOps + ins.ImpOps]

    return entry
def cdef_operand(
    op: disasmlib.Operand
) -> str:
    """
    Build the bddisasm C definition of a single operand.

    Parameters
    ----------
    op: Operand
        The operand to be converted to a C definition.

    Returns
    -------
    A string representing the bddisasm C definition of this operand.

    Example
    -------
    "OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0)"
    """
    op_type = 'ND_OPT_' + op.Type
    # A '?' size is spelled 'unknown' in the generated constant name.
    op_size = 'ND_OPS_' + ('unknown' if op.Size == '?' else op.Size)
    # Flag and decorator lists collapse to '0' when empty.
    op_flags = '|'.join(['ND_OPF_' + flag for flag in op.Flags]) or '0'
    op_access = 'ND_OPA_' + op.Access
    op_decorators = '|'.join(['ND_OPD_' + disasmlib.deco_op_flags[deco] for deco in op.Decorators]) or '0'
    return 'OP(%s, %s, %s, %s, %s, %d)' % (op_type, op_size, op_flags, op_access, op_decorators, op.Block)
def cdef_instruction(
    ins: disasmlib.Instruction
) -> str:
    """
    Build the bddisasm C or CPP definition of one instruction database entry.

    When idbe_format_designated is set, every field is emitted as a designated initializer (C style);
    otherwise aggregate initialization is used, with the field name kept in a comment (CPP style).

    Parameters
    ----------
    ins: Instruction
        The instruction to be converted to a C structure.

    Returns
    -------
    A multi-line string representing the bddisasm C or CPP definition of this instruction.

    Example
    -------
    Designated initializer definition:
        // Pos:3 Instruction:"AADD My,Gy" Encoding:"NP 0x0F 0x38 0xFC /r:mem"/"MR"
        {
            .Instruction = ND_INS_AADD,
            .Category = ND_CAT_RAOINT,
            .IsaSet = ND_SET_RAOINT,
            .Mnemonic = 2,
            .ValidPrefixes = 0,
            .ValidModes = ND_MOD_ANY,
            .ValidDecorators = 0,
            .OpsCount = ND_OPS_CNT(2, 0),
            .TupleType = 0,
            .ExcType = 0,
            .FpuFlags = 0,
            .EvexMode = 0,
            .TestedFlags = 0,
            .ModifiedFlags = 0,
            .SetFlags = 0,
            .ClearedFlags = 0,
            .Attributes = ND_FLAG_NOREX2|ND_FLAG_MODRM,
            .CpuidFlag = ND_CFF_RAOINT,
            .Operands =
            {
                OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0),
                OP(ND_OPT_G, ND_OPS_y, 0, ND_OPA_R, 0, 0),
            },
        },

    Aggregate initializer definition:
        // Pos:3 Instruction:"AADD My,Gy" Encoding:"NP 0x0F 0x38 0xFC /r:mem"/"MR"
        {
            /* Instruction */ ND_INS_AADD,
            /* Category */ ND_CAT_RAOINT,
            /* IsaSet */ ND_SET_RAOINT,
            /* Mnemonic */ 2,
            /* ValidPrefixes */ 0,
            /* ValidModes */ ND_MOD_ANY,
            /* ValidDecorators */ 0,
            /* OpsCount */ ND_OPS_CNT(2, 0),
            /* TupleType */ 0,
            /* ExcType */ 0,
            /* FpuFlags */ 0,
            /* EvexMode */ 0,
            /* TestedFlags */ 0,
            /* ModifiedFlags */ 0,
            /* SetFlags */ 0,
            /* ClearedFlags */ 0,
            /* Attributes */ ND_FLAG_NOREX2|ND_FLAG_MODRM,
            /* CpuidFlag */ ND_CFF_RAOINT,
            /* Operands */
            {
                OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0),
                OP(ND_OPT_G, ND_OPS_y, 0, ND_OPA_R, 0, 0),
            },
        },
    """
    idbe = instrux_to_idbe(ins)

    # The entry opens with a comment giving its position and encoding; 'S' is stripped from the
    # concatenated explicit-operand encodings.
    pieces = [
        ' // Pos:%d Instruction:"%s" Encoding:"%s"/"%s"\n' % (
            ins.Icount,
            str(ins),
            ins.RawEnc,
            ''.join([x.Encoding for x in ins.ExpOps]).replace('S', '')),
        ' {\n',
    ]

    # Field label: designated initializer, or the field name kept as a comment.
    label = ' .%s = ' if idbe_format_designated else ' /* %16s */ '

    for field, value in idbe.items():
        pieces.append(label % field)

        if type(value) is not list:
            # Scalar field.
            pieces.append(str(value) + ',\n')
            continue

        # List field (the operands): emitted as a nested initializer; an empty list becomes a lone 0.
        pieces.append('\n')
        pieces.append(' {\n')
        if len(value) == 0:
            pieces.append(' 0\n')
        else:
            pieces.extend([' ' + entry + ',\n' for entry in value])
        pieces.append(' },\n')

    pieces.append(' }')

    return ''.join(pieces)
def compute_index(
    value: str
) -> int:
    """
    Convert a component value to its index inside the C decoding table.

    Values present in the component_value_index dict are translated using it. Every other value is
    treated as a hexadecimal number and converted with int(value, 16). The returned index is used when
    decoding an instruction, in order to look up the next viable entry in the multi-way decode tree.

    Parameters
    ----------
    value: str
        The component value to be converted to an integer index.

    Returns
    -------
    An integer representing the index of the given decode component.

    Example
    -------
    Input:
        value: repz
    Returns:
        4

    Input:
        value: 0xCC
    Returns:
        204
    """
    # All mapped indexes are integers, so None can only mean "not a symbolic value".
    index = component_value_index.get(value)
    return int(value, 16) if index is None else index
def group_find_component(
    instructions: list[disasmlib.Instruction],
    components: list[dict]
) -> dict:
    """
    Pick the first decoding component that can be used to group the given instructions.

    Components are tried in list order. For example, on the initial call for the list of all legacy
    instructions, the "opcode" component would be returned.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        The list of instructions to be grouped.
    components:
        The list of components used for grouping.

    Returns
    -------
    A dict representing the decode component that can be used to cover all instructions in the list.
    None if no such component could be found.
    """
    for candidate in components:
        # Lazily tells, for each instruction, whether it still has a value for this component.
        present = (bool(ins.Encoding[candidate['type']]) for ins in instructions)

        # Mandatory components (for example, opcode) must be present for every instruction in the
        # list; optional ones only need to be present for at least one of them.
        usable = all(present) if candidate['all'] else any(present)
        if usable:
            return candidate

    return None
def group_instructions(
    instructions: list[disasmlib.Instruction],
    components: list[dict]
) -> dict:
    """
    Recursively build the decoding tree for a list of instructions.

    The best grouping component is found, all instructions are distributed inside an array of children
    entries based on it, and the function recurses for every non-empty child, until only leaf entries
    are left. A leaf entry is composed of a single Instruction object.

    Note that the component value used at each step is popped from the instruction's Encoding, so the
    instructions passed in are modified.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        The list of instructions to be grouped.
    components:
        The list of components used for grouping.

    Returns
    -------
    A dictionary containing two keys:
        - "component": the type of the component used for the current grouping (one of the types listed
          in the given components), or "leaf" for a leaf entry.
        - "children": for a leaf, the Instruction itself. Otherwise, an array of N entries, where N is
          the size configured for the component in components_ilut (for example 256 for "opcode" and 8
          for "modrmreg"). Each entry is either empty or the group built for the instructions that
          landed in that position.

    Raises
    ------
    Exception
        If several instructions are left but no component can tell them apart.

    Example
    -------
    Consider the following list of (simplified) initial instructions (only opcode and reduced encoding shown):
        [{"I1", "0xBD"}, {"I2", "0xCC"}, {"I4", "FF /1"}, {"I5", "FF /5"}]
    During the first call, "opcode" would be chosen to group the instructions, leading to:
        {
            "component": "opcode",
            "children": [
                ...
                Pos 0xBD: [{"I1", "0xBD"}],
                ...
                Pos 0xCC: [{"I2", "0xCC"}],
                ...
                Pos 0xFF: [{"I4", "FF /1"}, {"I5", "FF /5"}]
            ]
        }
    Each child is then grouped recursively. Opcodes 0xBD and 0xCC already hold leaf entries, so they need
    no further grouping. For opcode 0xFF, "modrmreg" is chosen at the next step, leading to:
        {
            "component": "modrmreg",
            "children": [
                Pos 0: []
                Pos 1: [{"I4", "FF /1"}]
                Pos 2: []
                Pos 3: []
                Pos 4: []
                Pos 5: [{"I5", "FF /5"}]
                Pos 6: []
                Pos 7: []
            ]
        }
    All of these children are leaf entries, so the recursion stops there.
    """
    # Find a good grouping component for the current instruction list.
    comp = group_find_component(instructions, components)

    if not comp:
        if len(instructions) == 1:
            # Reached a leaf entry, no more grouping needed.
            return {'component': 'leaf', 'children': instructions[0]}

        # No grouping component found for multiple instructions - error.
        print("ERROR: Cannot properly group the following instructions. Please review specs!")
        for ins in instructions:
            print(" -> ", ins, " with encoding: ", ins.RawEnc)
        raise Exception("Grouping error: invalid/incomplete specification!")

    kind = comp['type']

    # One (initially empty) bucket for every possible value of the chosen component.
    buckets = [[] for _ in range(components_ilut[kind]['size'])]

    # Distribute every instruction to its position. The value used is popped from the instruction
    # encoding, so that it is not used again; instructions lacking the component go to position 0.
    for ins in instructions:
        pending = ins.Encoding[kind]
        position = compute_index(pending.pop(0)) if len(pending) > 0 else 0
        buckets[position].append(ins)

    # Recursively group every non-empty bucket; empty ones are kept as they are.
    children = [group_instructions(bucket, components) if bucket else bucket for bucket in buckets]

    return {'component': kind, 'children': children}
def group_dump(
    group: map,
    level: int = 0
):
    """
    Print the entire translation tree identified by the root "group".

    Parameters
    ----------
    group:
        The tree node to print, as built by group_instructions.
    level:
        The depth of the node; each printed line is prefixed with that many spaces.
    """
    indent = " " * level

    if group['component'] == 'leaf':
        # A leaf holds the instruction itself.
        print(indent, group['children'])
        return

    # Print every populated position, followed by its sub-tree one level deeper.
    for position, child in enumerate(group['children']):
        if child:
            print(indent, group['component'], '%02x' % position)
            group_dump(child, level + 1)
def dump_translation_tables(
    instructions: list[disasmlib.Instruction]
):
    """
    Generate the instruction translation trees and write them to the bdx86_table_*.h files.

    The instructions are split into VEX, XOP, EVEX and legacy tables (checked in that order). Each table
    is then grouped into a decoding tree, converted to C and written to its own header, in the order
    legacy, VEX, XOP, EVEX.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        All the instructions to be placed in the translation tables.
    """
    legacy, vex, xop, evex = [], [], [], []

    # Distribute each instruction type into its own table.
    for ins in instructions:
        if ins.Vex:
            vex.append(ins)
        elif ins.Xop:
            xop.append(ins)
        elif ins.Evex:
            evex.append(ins)
        else:
            legacy.append(ins)

    # (table, grouping components, root table name format, progress message, output path, include guard)
    outputs = (
        (legacy, components_legacy, 'gLegacyMap_%s', 'Writing the bdx86_table_root.h file...',
         r'../bddisasm/include/bdx86_table_root.h', 'BDX86_TABLE_ROOT_H'),
        (vex, components_ex, 'gVexMap_%s', 'Writing the bdx86_table_vex.h file...',
         r'../bddisasm/include/bdx86_table_vex.h', 'BDX86_TABLE_VEX_H'),
        (xop, components_ex, 'gXopMap_%s', 'Writing the bdx86_table_xop.h file...',
         r'../bddisasm/include/bdx86_table_xop.h', 'BDX86_TABLE_XOP_H'),
        (evex, components_ex, 'gEvexMap_%s', 'Writing the bdx86_table_evex.h file...',
         r'../bddisasm/include/bdx86_table_evex.h', 'BDX86_TABLE_EVEX_H'),
    )

    for table, components, name_format, message, path, guard in outputs:
        # The root table is named after the component chosen at the top of the tree.
        tree = group_instructions(table, components)
        tree_cdef = group_generate_c_table(tree, name_format % tree['component'])

        print(message)
        with open(path, 'wt') as f:
            f.write(header)
            f.write('#ifndef %s\n' % guard)
            f.write('#define %s\n\n' % guard)
            f.write(tree_cdef)
            f.write('\n#endif\n\n')
def group_generate_c_table(
    group: map,
    name: str
) -> str:
    """
    Generate the translation tree, in C format, for the decoding tree identified by group.

    Parameters
    ----------
    group:
        The tree node to convert, as built by group_instructions.
    name:
        The C identifier given to the table generated for this node.

    Returns
    -------
    A string holding the C definitions of all the tables in the tree. Child tables come before the
    table that references them, so the table for "group" itself is the last one in the string.
    """
    if group['component'] == 'leaf':
        # Instruction, construct a dummy table that directly points to the instruction.
        instruction = group['children']
        return ''.join([
            'const ND_TABLE_INSTRUCTION %s = \n' % name,
            '{\n',
            ' ND_ILUT_INSTRUCTION,\n',
            ' (const void *)&gInstructions[% 4d] // %s\n' % (instruction.Icount, str(instruction)),
            '};\n\n',
        ])

    layout = components_ilut[group['component']]
    child_tables = []
    entries = []

    for position, child in enumerate(group['children']):
        if not child:
            entries.append(' /* %02x */ (const void *)ND_NULL,\n' % (position))
            continue

        # Child tables are named after the parent, their position and their own component.
        child_name = name + ('_%02x_%s' % (position, child['component']))
        entries.append(' /* %02x */ (const void *)&%s,\n' % (position, child_name))
        child_tables.append(group_generate_c_table(child, child_name))

    # Each child definition goes in front of everything generated so far, hence the reversed order.
    return ''.join(
        child_tables[::-1]
        + ['const %s %s = \n' % (layout['type'], name),
           '{\n',
           ' %s,\n' % layout['ilut'],
           ' {\n']
        + entries
        + [' }\n',
           '};\n\n'])
def generate_mnemonics(instructions):
    """
    Return the sorted list of distinct mnemonics used by the given instructions.
    """
    return sorted({ins.Mnemonic for ins in instructions})
def generate_constants(lst, pre = False):
    """
    Return the sorted list of distinct C constant names for the given entries.

    When pre is true, the names are 'ND_PRE_' followed by each entry's Mnemonic; otherwise they are
    'ND_INS_' followed by each entry's Class.
    """
    if pre:
        names = {'ND_PRE_' + item.Mnemonic for item in lst}
    else:
        names = {'ND_INS_' + item.Class for item in lst}
    return sorted(names)
def generate_constants2(instructions):
    """
    Return two sorted lists of distinct C constant names: the instruction sets ('ND_SET_' + Set) and
    the instruction categories ('ND_CAT_' + Category) used by the given instructions.
    """
    set_names, category_names = set(), set()
    for ins in instructions:
        set_names.add('ND_SET_' + ins.Set)
        category_names.add('ND_CAT_' + ins.Category)
    return sorted(set_names), sorted(category_names)
def dump_mnemonics(mnemonics, prefixes, fname):
    """
    Write the C header that defines the gMnemonics string array.

    Parameters
    ----------
    mnemonics:
        The mnemonics to emit, in the order they should appear in the array.
    prefixes:
        Not used by this function.
    fname:
        Path of the header file to be written.
    """
    with open(fname, 'wt') as f:
        f.writelines([
            header,
            '#ifndef BDX86_MNEMONICS_H\n',
            '#define BDX86_MNEMONICS_H\n',
            '\n',
            '#ifndef BDDISASM_NO_MNEMONIC\n',
            '\n',
            'const char *gMnemonics[%d] = \n' % len(mnemonics),
            '{\n',
            ' ',
        ])

        # Start a new output line once the mnemonics emitted on the current one exceed 60 characters
        # (each entry is counted as its length plus 4).
        line_width = 0
        for mnemonic in mnemonics:
            f.write('"%s", ' % mnemonic)
            line_width += len(mnemonic) + 4
            if line_width > 60:
                line_width = 0
                f.write('\n ')

        f.writelines([
            '\n};\n',
            '\n',
            '#endif // !BDDISASM_NO_MNEMONIC\n',
            '\n\n',
            '#endif\n\n',
        ])
def dump_constants(constants, prefixes, constants_sets, constants_types, fname):
    """
    Write the C header that defines the instruction class, instruction set and category enums.

    Parameters
    ----------
    constants:
        Names of the ND_INS_CLASS enum members.
    prefixes:
        Not used by this function.
    constants_sets:
        Names of the ND_INS_SET enum members.
    constants_types:
        Names of the ND_INS_CATEGORY enum members.
    fname:
        Path of the header file to be written.
    """
    # (enum tag, first member, remaining members, typedef name), in output order.
    enums = (
        ('_ND_INS_CLASS', 'ND_INS_INVALID', constants, 'ND_INS_CLASS'),
        ('_ND_INS_SET', 'ND_SET_INVALID', constants_sets, 'ND_INS_SET'),
        ('_ND_INS_TYPE', 'ND_CAT_INVALID', constants_types, 'ND_INS_CATEGORY'),
    )

    with open(fname, 'wt') as f:
        f.writelines([
            header,
            '#ifndef BDX86_CONSTANTS_H\n',
            '#define BDX86_CONSTANTS_H\n\n',
            '\n',
        ])

        for tag, invalid_member, members, typedef_name in enums:
            f.write('typedef enum %s\n' % tag)
            f.write('{\n')
            # The explicit zero value is reserved for the invalid entry.
            f.write(' %s = 0,\n' % invalid_member)
            f.writelines(' %s,\n' % member for member in members)
            f.write('\n} %s;\n\n\n' % typedef_name)

        # Done!
        f.write('\n#endif\n')
def dump_master_table(instructions, fname):
    """
    Write the C header that defines gInstructions, the main instruction database.

    Parameters
    ----------
    instructions:
        The instructions to emit; each one is converted with cdef_instruction, in the given order.
    fname:
        Path of the header file to be written.
    """
    with open(fname, 'wt') as f:
        f.writelines([
            header,
            '#ifndef BDX86_INSTRUCTIONS_H\n',
            '#define BDX86_INSTRUCTIONS_H\n',
            '\n',
            'const ND_IDBE gInstructions[%s] = \n' % len(instructions),
            '{\n',
        ])
        f.writelines('%s, \n\n' % cdef_instruction(ins) for ins in instructions)
        f.writelines([
            '\n};\n',
            '\n#endif\n',
        ])
def dump_features(features, fname):
    """
    Write the C header that defines one ND_CFF_* macro for every CPUID feature.

    Parameters
    ----------
    features:
        The CPUID features to emit; the Name, Leaf, SubLeaf, Reg and Bit attributes of each are used.
    fname:
        Path of the header file to be written.
    """
    with open(fname, 'wt') as f:
        f.writelines([
            header,
            '#ifndef BDX86_CPUID_FLAGS_H\n',
            '#define BDX86_CPUID_FLAGS_H\n',
            '\n',
            '#define ND_CFF_NO_LEAF 0xFFFFFFFF\n',
            '#define ND_CFF_NO_SUBLEAF 0x00FFFFFF\n',
            '\n',
            '\n',
            '#define ND_CFF(leaf, subleaf, reg, bit) ((ND_UINT64)(leaf) | ((ND_UINT64)((subleaf) & 0xFFFFFF) << 32) | ((ND_UINT64)(reg) << 56) | ((ND_UINT64)(bit) << 59))\n',
        ])

        for feature in features:
            # The name is padded to 25 columns so that the macro bodies line up.
            f.write('#define ND_CFF_%sND_CFF(%s, %s, %s, %s)\n' % (
                feature.Name.ljust(25), feature.Leaf, feature.SubLeaf, 'NDR_' + feature.Reg, feature.Bit))

        f.writelines([
            '\n',
            '#endif // CPUID_FLAGS_H\n',
        ])
#
# =============================================================================
# Main
# =============================================================================
#
if __name__ == "__main__":
    # The only argument is the directory that holds the *.dat definition files.
    if len(sys.argv) < 2:
        print('Usage: %s defs-file-dir' % os.path.basename(sys.argv[0]))
        sys.exit(-1)

    # Extract the flags.
    print('Loading flags access templates...')
    disasmlib.parse_flags_file('%s/flags.dat' % sys.argv[1])

    # Extract the CPUID features.
    print('Loading CPUID feature flags templates...')
    features = disasmlib.parse_cff_file('%s/cpuid.dat' % sys.argv[1])

    # Extract the valid modes.
    # NOTE(review): the 'parse_modess_file' spelling is kept as found - confirm it matches disasmlib.
    print('Loading CPU operating modes templates...')
    insmodes = disasmlib.parse_modess_file('%s/modes.dat' % sys.argv[1])

    # Extract the instructions. glob returns the files in a file-system dependent order, and the sort
    # below keeps the load order for instructions sharing a mnemonic, so the files are visited in sorted
    # order to make the generated instruction indexes reproducible across machines.
    for fn in sorted(glob.glob('%s/table*.dat' % sys.argv[1])):
        print('Loading instructions from %s...' % fn)
        instructions = instructions + disasmlib.parse_ins_file(fn)

    # Sort the instructions by mnemonic; the resulting position is the index of each instruction inside
    # the generated gInstructions array.
    instructions = sorted(instructions, key = lambda x: x.Mnemonic)
    for i in range(0, len(instructions)):
        instructions[i].Icount = i

    # Generate the mnemonics
    mnemonics = generate_mnemonics(instructions)
    mnemonics_prefixes = generate_mnemonics(prefixes)

    # Generate the constants
    constants = generate_constants(instructions)
    constants_prefixes = generate_constants(prefixes, True)
    constants_sets, constants_types = generate_constants2(instructions)

    #
    # Dump all data to files.
    #

    # Dump the mnemonics
    print('Writing the bdx86_mnemonics.h (instruction mnemonics) file...')
    dump_mnemonics(mnemonics, mnemonics_prefixes, r'../bddisasm/include/bdx86_mnemonics.h')

    # Dump the instruction constants
    print('Writing the bdx86_constants.h (instruction definitions) file...')
    dump_constants(constants, constants_prefixes, constants_sets, constants_types, r'../inc/bdx86_constants.h')

    # Dump the CPUID feature flags.
    print('Writing the bdx86_cpuidflags.h (CPUID feature flags) file...')
    dump_features(features, r'../inc/bdx86_cpuidflags.h')

    # Dump the instruction database.
    print('Writing the bdx86_instructions.h (main instruction database) file...')
    dump_master_table(instructions, r'../bddisasm/include/bdx86_instructions.h')

    # Dump the translation tables.
    print('Writing the translation tables...')
    dump_translation_tables(instructions)

    print('Instruction successfully parsed & header files generated!')