bddisasm/isagenerator/generate_tables.py

#!/usr/bin/env python3
#
# Copyright (c) 2024 Bitdefender
# SPDX-License-Identifier: Apache-2.0
#
import os
import sys
import re
import copy
import glob
import disasmlib

# Banner placed at the very top of every generated header: each dump_* function and
# dump_translation_tables writes this string first, before the include guard.
header = '''/*
 * Copyright (c) 2024 Bitdefender
 * SPDX-License-Identifier: Apache-2.0
 */

//
// This file was auto-generated by generate_tables.py. DO NOT MODIFY!
//

'''

# Selects how cdef_instruction emits each field of an instruction database entry.
# Set this to True to generate the instructions database with designated initializers format:
#   ".FieldName = Value"
# Set this to False to generate the instructions database with aggregate initializers format:
#   "/* FieldName */ Value"
# Designated initializers can be used when building as C code (recommended!) or C++ >= 20.
# Aggregate initializers must be used when building as C++ code (NOT recommended).
idbe_format_designated = True


#
# These are the encoding components used to group instructions. Important things to note:
# - The order here is important! The higher up (the lower the index) for a given component, the more priority it
#   receives when grouping instructions (group_find_component returns the first usable entry).
# - Each entry is a dict:
#     'type': the component type/name (also the key used to index Instruction.Encoding and components_ilut)
#     'all' : a boolean indicating whether the component is mandatory for all instructions in a group (True)
#             or optional (False)
#
# For example, "opcode" is mandatory for all instructions in a group, as we can't group them otherwise. However,
# the auxiliary entry is not mandatory - in a given group, some instructions may have such an entry, while others do
# not. Those that lack an optional component will be treated as "default" encoding (index 0 in the decode table).
#
# Example of optional component:
#        0x90 : NOP
#   repz 0x90 : PAUSE
# In this case, the `repz` is an optional encoding component, meaning that its presence will cause PAUSE
# to be decoded, but its absence will cause NOP to be decoded.
#
# Example of mandatory component:
#   0x0F 0x00 /0:mem: SLDT Mw
#   0x0F 0x00 /0:reg: SLDT Rv
# In this case, all instructions must specify both the reg component (/0) and the mod (mem or reg).
# If any of them is present for one instruction but absent for another, this will lead to an error. For example, the
# following spec will cause an error, since there's no grouping possible, because one of the instructions specifies
# mem mode, while the other does not specify anything:
#   0x0F 0x00 /0:mem : SLDT Mw
#   0x0F 0x00 /0     : SLDT Rv
#
components_legacy = [
    { 'type': 'opcode'      , 'all': True  },
    { 'type': 'opcode_last' , 'all': True  },
    { 'type': 'vendor'      , 'all': False },
    { 'type': 'feature'     , 'all': False },
    { 'type': 'prefix'      , 'all': True  },
    { 'type': 'modrmreg'    , 'all': True  },
    { 'type': 'modrmmod'    , 'all': True  },
    { 'type': 'modrmrm'     , 'all': True  },
    { 'type': 'mode'        , 'all': False },
    { 'type': 'dsize'       , 'all': False },
    { 'type': 'asize'       , 'all': False },
    { 'type': 'auxiliary'   , 'all': False },
    { 'type': 'w'           , 'all': True  },
]

# Encoding components used to group the VEX, XOP and EVEX instructions (dump_translation_tables passes this list
# to group_instructions for those three maps). Same entry layout as components_legacy ('type' + 'all'), and the
# order is again the grouping priority. Every component here is marked mandatory ('all': True).
components_ex = [
    { 'type': 'mmmmm'       , 'all': True  },
    { 'type': 'opcode'      , 'all': True  },
    { 'type': 'pp'          , 'all': True  },
    { 'type': 'modrmreg'    , 'all': True  },
    { 'type': 'modrmmod'    , 'all': True  },
    { 'type': 'modrmrm'     , 'all': True  },
    { 'type': 'l'           , 'all': True  },
    { 'type': 'w'           , 'all': True  },
    { 'type': 'wi'          , 'all': True  },
    { 'type': 'nd'          , 'all': True  },
    { 'type': 'nf'          , 'all': True  },
    { 'type': 'sc'          , 'all': True  },
]


# Maps symbolic component values to their slot index inside a decode table (see compute_index). Any value that
# is not a key of this dict is parsed by compute_index as a hexadecimal number instead.
# The same index appears under several sections below; presumably each index is only meaningful within the table
# of the component the value belongs to (group_instructions indexes one component's children at a time).
component_value_index = {
    None    : 0,
    'None'  : 0,

    # modrm.mod
    'mem'   : 0,
    'reg'   : 1,

    # mandatory prefixes; using 'P' prefix so they're not confused with an opcode
    'PNP'   : 0,
    'P0x66' : 1,
    'P0xF3' : 2,
    'P0xF2' : 3,

    # other prefixes/redirection conditions
    'rexb'  : 1,
    'rexw'  : 2,
    'mo64'  : 3,
    'repz'  : 4,
    'rep'   : 5,
    'riprel': 6,
    'rex2'  : 7,
    'rex2w' : 8,

    # Mode
    'm16'   : 1,
    'm32'   : 2,
    'm64'   : 3,

    # Default data size
    'ds16'  : 1,
    'ds32'  : 2,
    'ds64'  : 3,
    'dds64' : 4,
    'fds64' : 5,

    # Default address size
    'as16'  : 1,
    'as32'  : 2,
    'as64'  : 3,

    # Vendor redirection.
    'any'   : 0,
    'intel' : 1,
    'amd'   : 2,

    # Feature redirection.
    'mpx'   : 1,
    'cet'   : 2,
    'cldm'  : 3,
    'piti'  : 4,
}


#
# This dictionary describes how the decoding tables look. Each decoding component has associated a C decoding table.
# For every component type:
#   'ilut': the constant emitted as the first member of the generated C table (group_generate_c_table)
#   'size': the number of child slots allocated for the component (group_instructions)
#   'type': the C type name used to declare the generated table (group_generate_c_table)
#
components_ilut = {
    'opcode' :      { 'ilut': 'ND_ILUT_OPCODE',      'size': 256, 'type': 'ND_TABLE_OPCODE'     },
    'opcode_last' : { 'ilut': 'ND_ILUT_OPCODE_LAST', 'size': 256, 'type': 'ND_TABLE_OPCODE'     },
    'modrmmod' :    { 'ilut': 'ND_ILUT_MODRM_MOD',   'size': 2,   'type': 'ND_TABLE_MODRM_MOD'  },
    'modrmreg' :    { 'ilut': 'ND_ILUT_MODRM_REG',   'size': 8,   'type': 'ND_TABLE_MODRM_REG'  },
    'modrmrm' :     { 'ilut': 'ND_ILUT_MODRM_RM',    'size': 8,   'type': 'ND_TABLE_MODRM_RM'   },
    'prefix' :      { 'ilut': 'ND_ILUT_MAN_PREFIX',  'size': 4,   'type': 'ND_TABLE_MPREFIX'    },
    'mode' :        { 'ilut': 'ND_ILUT_MODE',        'size': 4,   'type': 'ND_TABLE_MODE'       },
    'dsize' :       { 'ilut': 'ND_ILUT_DSIZE',       'size': 6,   'type': 'ND_TABLE_DSIZE'      },
    'asize' :       { 'ilut': 'ND_ILUT_ASIZE',       'size': 4,   'type': 'ND_TABLE_ASIZE'      },
    'auxiliary' :   { 'ilut': 'ND_ILUT_AUXILIARY',   'size': 10,  'type': 'ND_TABLE_AUXILIARY'  },
    'vendor' :      { 'ilut': 'ND_ILUT_VENDOR',      'size': 6,   'type': 'ND_TABLE_VENDOR'     },
    'feature' :     { 'ilut': 'ND_ILUT_FEATURE',     'size': 8,   'type': 'ND_TABLE_FEATURE'    },
    'mmmmm' :       { 'ilut': 'ND_ILUT_EX_M',        'size': 32,  'type': 'ND_TABLE_EX_M'       },
    'pp' :          { 'ilut': 'ND_ILUT_EX_PP',       'size': 4,   'type': 'ND_TABLE_EX_PP'      },
    'l' :           { 'ilut': 'ND_ILUT_EX_L',        'size': 4,   'type': 'ND_TABLE_EX_L'       },
    'w' :           { 'ilut': 'ND_ILUT_EX_W',        'size': 2,   'type': 'ND_TABLE_EX_W'       },
    'wi' :          { 'ilut': 'ND_ILUT_EX_WI',       'size': 2,   'type': 'ND_TABLE_EX_W'       },
    'nd' :          { 'ilut': 'ND_ILUT_EX_ND',       'size': 2,   'type': 'ND_TABLE_EX_ND'      },
    'nf' :          { 'ilut': 'ND_ILUT_EX_NF',       'size': 2,   'type': 'ND_TABLE_EX_NF'      },
    'sc' :          { 'ilut': 'ND_ILUT_EX_SC',       'size': 16,  'type': 'ND_TABLE_EX_SC'      },
}


# Module-level state, populated by the main script at the bottom of this file.

# Sorted list of unique mnemonics; instrux_to_idbe stores the position of a mnemonic in this list.
mnemonics = []
# NOTE(review): not referenced anywhere else in the visible source (the main script assigns
# "mnemonics_prefixes" instead) - confirm whether this is still needed.
mnemonics_prefix = []

# All instructions loaded from the table*.dat files.
instructions = []
# Never filled in by the visible code; it is passed, still empty, to generate_mnemonics/generate_constants.
prefixes = []
# CPUID feature descriptions returned by disasmlib.parse_cff_file; read by instrux_to_idbe.
features = []


def instrux_to_idbe(
    ins: disasmlib.Instruction
    ) -> dict:
    """
    Build the dictionary counterpart of a ND_IDBE structure for one instruction.

    Every key stands for one ND_IDBE field, and the insertion order of the keys is the order
    in which the fields get emitted, so:
    - keys must be inserted in the same order as the fields are declared inside ND_IDBE;
    - no field may be skipped; unused fields are set to a default value ('0');
    - key names must match the ND_IDBE field names exactly.

    Parameters
    ----------
    ins: disasmlib.Instruction
        The instruction to be converted to a dictionary.

    Returns
    -------
    A dictionary mapping each field name to the C expression (a string) used to initialize it.
    The 'Operands' entry is a list of strings, one C operand definition per operand.
    """
    def or_mask(prefix, names):
        # '|'-joined list of prefixed, upper-cased names; '0' when there is nothing to join.
        if not names:
            return '0'
        return '|'.join([prefix + name.upper() for name in names])

    idbe = {
        # Instruction class, category, ISA set.
        'Instruction': 'ND_INS_' + ins.Class,
        'Category': 'ND_CAT_' + ins.Category,
        'IsaSet': 'ND_SET_' + ins.Set,
        # The mnemonic is stored as its index inside the global mnemonics list.
        'Mnemonic': '%d' % (mnemonics.index(ins.Mnemonic)),
        # Accepted prefixes map.
        'ValidPrefixes': or_mask('ND_PREF_', ins.Prefmap),
    }

    # Valid modes: collapse to ND_MOD_ANY when every known CPU mode is accepted.
    if all(mode in ins.Modes for mode in disasmlib.valid_cpu_modes):
        idbe['ValidModes'] = 'ND_MOD_ANY'
    else:
        idbe['ValidModes'] = '|'.join(['ND_MOD_' + mode.upper() for mode in ins.Modes])

    # Valid decorators.
    idbe['ValidDecorators'] = or_mask('ND_DECO_', ins.DecoFlags)

    # Operand count: explicit operands first, implicit ones second.
    idbe['OpsCount'] = 'ND_OPS_CNT(%d, %d)' % (len(ins.ExpOps), len(ins.ImpOps))

    # The tuple type is only emitted for EVEX instructions that define one.
    idbe['TupleType'] = ('ND_TUPLE_' + ins.Tuple.upper()) if (ins.Evex and ins.Tuple) else '0'

    # Exception type.
    idbe['ExcType'] = ('ND_EXT_' + ins.ExType) if ins.ExType else '0'

    # FpuFlags (x87 instructions only): four 2-bit codes, entry i occupying bits [2i+1:2i].
    if ins.Set == 'X87':
        codes = { '0': 0, '1': 1, 'm': 2, 'u': 3 }
        packed = sum(codes[ins.FpuFlags[pos]] << (pos * 2) for pos in range(4))
        idbe['FpuFlags'] = '0x%02x' % packed
    else:
        idbe['FpuFlags'] = '0'

    # EVEX mode.
    idbe['EvexMode'] = ('ND_EVEXM_' + ins.EvexMode.upper()) if ins.EvexMode else '0'

    # Flags (tested, modified, set, cleared). The 'u' flags of ins.Rflags are appended
    # to both the set and the cleared masks.
    flag_fields = (
        ('t', 'TestedFlags'),
        ('m', 'ModifiedFlags'),
        ('1', 'SetFlags'),
        ('0', 'ClearedFlags'),
    )
    for key, field in flag_fields:
        names = ins.Rflags[key]
        if key in ('1', '0'):
            names = names + ins.Rflags['u']
        idbe[field] = '0' + ''.join(['|NDR_RFLAG_%s' % name.upper() for name in names])

    # Instruction attributes; 'nil' and the per-operand OP1..OP6 attributes are left out.
    operand_markers = ('OP1', 'OP2', 'OP3', 'OP4', 'OP5', 'OP6')
    attributes = ['ND_FLAG_' + attr.upper() for attr in ins.Attributes
                  if attr != 'nil' and not attr.startswith(operand_markers)]
    idbe['Attributes'] = '|'.join(attributes) or '0'

    # CPUID flag: emitted only if a known feature carries the same name as the instruction Id.
    matching = [feat.Name for feat in features if feat.Name == ins.Id]
    idbe['CpuidFlag'] = ('ND_CFF_%s' % matching[-1]) if matching else '0'

    # List of instruction operands, explicit ones first.
    idbe['Operands'] = [cdef_operand(op) for op in ins.ExpOps + ins.ImpOps]

    return idbe

def cdef_operand(
    op: disasmlib.Operand
    ) -> str:
    """
    Build the bddisasm C definition of a single operand.

    Parameters
    ----------
    op: Operand
        The operand to be converted in a C definition.

    Returns
    -------
    A string of the form "OP(type, size, flags, access, decorators, block)".

    Example
    -------
    "OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0)"
    """
    optype = 'ND_OPT_' + op.Type
    # A size of '?' is spelled out as 'unknown' in the C constant.
    opsize = 'ND_OPS_' + ('unknown' if op.Size == '?' else op.Size)
    # Flags and decorators are OR-ed together; '0' is emitted when there are none.
    opflags = '|'.join('ND_OPF_' + flag for flag in op.Flags) or '0'
    access = 'ND_OPA_' + op.Access
    decorators = '|'.join('ND_OPD_' + disasmlib.deco_op_flags[deco] for deco in op.Decorators) or '0'

    return 'OP(%s, %s, %s, %s, %s, %d)' % (optype, opsize, opflags, access, decorators, op.Block)

def cdef_instruction(
    ins: disasmlib.Instruction
    ) -> str:
    """
    Build the C (or C++) initializer of one instruction database entry.

    The fields come from instrux_to_idbe, in that order. Depending on idbe_format_designated,
    each field is emitted either as a designated initializer (".Field = value,") or as an
    aggregate initializer preceded by a comment holding the field name, right-aligned to
    16 characters ("/* Field */ value,").

    Parameters
    ----------
    ins: Instruction
        The instruction to be converted in a C structure.

    Returns
    -------
    A multi-line string: a "// Pos:..." comment line followed by the braced initializer.
    The string ends with the closing brace; the caller appends the separator.

    Example
    -------
    Designated initializer definition:
    // Pos:3 Instruction:"AADD My,Gy" Encoding:"NP 0x0F 0x38 0xFC /r:mem"/"MR"
    {
        .Instruction = ND_INS_AADD,
        .Category = ND_CAT_RAOINT,
        .IsaSet = ND_SET_RAOINT,
        .Mnemonic = 2,
        .ValidPrefixes = 0,
        .ValidModes = ND_MOD_ANY,
        .ValidDecorators = 0,
        .OpsCount = ND_OPS_CNT(2, 0),
        .TupleType = 0,
        .ExcType = 0,
        .FpuFlags = 0,
        .EvexMode = 0,
        .TestedFlags = 0,
        .ModifiedFlags = 0,
        .SetFlags = 0,
        .ClearedFlags = 0,
        .Attributes = ND_FLAG_NOREX2|ND_FLAG_MODRM,
        .CpuidFlag = ND_CFF_RAOINT,
        .Operands = 
        {
            OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0),
            OP(ND_OPT_G, ND_OPS_y, 0, ND_OPA_R, 0, 0),
        },
    }

    Aggregate initializer definition (same entry, first and last fields shown):
    // Pos:3 Instruction:"AADD My,Gy" Encoding:"NP 0x0F 0x38 0xFC /r:mem"/"MR"
    {
        /*      Instruction */ ND_INS_AADD,
        /*         Category */ ND_CAT_RAOINT,
        ...
        /*        CpuidFlag */ ND_CFF_RAOINT,
        /*         Operands */ 
        {
            OP(ND_OPT_M, ND_OPS_y, 0, ND_OPA_RW, 0, 0),
            OP(ND_OPT_G, ND_OPS_y, 0, ND_OPA_R, 0, 0),
        },
    }
    """
    idbe = instrux_to_idbe(ins)

    # Field label template, chosen once according to the requested initializer format.
    if idbe_format_designated:
        label = '        .%s = '
    else:
        label = '        /* %16s */ '

    # Start with the position and encoding description; 'S' is stripped from the operand encodings.
    operand_encoding = ''.join([x.Encoding for x in ins.ExpOps]).replace('S', '')
    pieces = [
        '    // Pos:%d Instruction:"%s" Encoding:"%s"/"%s"\n' % (
            ins.Icount, str(ins), ins.RawEnc, operand_encoding),
        '    {\n',
    ]

    for field, value in idbe.items():
        pieces.append(label % field)

        if not isinstance(value, list):
            pieces.append(str(value) + ',\n')
            continue

        # List-valued fields (the operands) get their own nested braces.
        pieces.append('\n')
        pieces.append('        {\n')
        if value:
            pieces.extend('            ' + entry + ',\n' for entry in value)
        else:
            # Keep the nested initializer non-empty when there are no entries.
            pieces.append('                0\n')
        pieces.append('        },\n')

    pieces.append('    }')

    return ''.join(pieces)


def compute_index(
    value: str
    ) -> int:
    """
    Translate a component value into its index inside the C decoding table.

    Values listed in component_value_index are translated through that dict. Everything else
    is taken to be a hexadecimal number and converted with int(value, 16). The resulting index
    selects the next entry to follow in the multi-way decode tree.

    Parameters
    ----------
    value: str
        The component value to be converted to an integer index.

    Returns
    -------
    An integer representing the index of the given decode component.

    Example
    -------
    Input:
        value: repz
    Returns:
        4

    Input:
        value: 0xCC
    Returns:
        204
    """
    if value not in component_value_index:
        # Not a symbolic value, so it must be a hex literal such as "0xCC".
        return int(value, 16)
    return component_value_index[value]


def group_find_component(
    instructions: list[disasmlib.Instruction],
    components: list[dict]
    ) -> dict:
    """
    Pick the decoding component by which the given instructions can be grouped.

    The components are tried in list order and the first usable one wins. For example, on the
    first call for the full list of legacy instructions, "opcode" would be returned.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        The list of instructions to be grouped.
    components:
        The list of components used for grouping, highest priority first.

    Returns
    -------
    The dict (entry of "components") describing the component to group by.
    None if no such component could be found.
    """
    for candidate in components:
        # Lazily tells, per instruction, whether it still has a value for this component.
        present = (bool(ins.Encoding[candidate['type']]) for ins in instructions)

        if candidate['all']:
            # Mandatory components (e.g. opcode) must be present for every instruction.
            usable = all(present)
        else:
            # Optional components need only be present for a single instruction in the list.
            usable = any(present)

        if usable:
            return candidate

    return None


def group_instructions(
    instructions: list[disasmlib.Instruction],
    components: list[dict]
    ) -> dict:
    """
    Recursively build the decode tree for a list of instructions.

    At each level, group_find_component selects the grouping component, every instruction is
    placed in the child slot given by its value for that component, and each non-empty slot is
    grouped again. The recursion stops at leaf entries, which hold a single Instruction.

    Note: the value consumed at each level is popped from the instruction's Encoding list,
    so the Instruction objects are modified by this function.

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        The list of instructions to be grouped.
    components:
        The list of components used for grouping ("components_legacy" or "components_ex").

    Returns
    -------
    A dictionary containing two keys:
        - "component": the type of the component used for the current grouping, or "leaf".
        - "children": for a leaf, the Instruction itself. Otherwise a list of N entries, N being
          the table size of the component in components_ilut (256 for "opcode", 8 for
          "modrmreg", ...). An entry is either an empty list or the group dictionary of the
          instructions that landed on that position.

    Raises
    ------
    Exception if several instructions remain but no component can tell them apart.

    Example
    -------
    Consider the following list of (simplified) initial instructions (only opcode and reduced encoding shown):
    [{"I1", "0xBD"}, {"I2", "0xCC"}, {"I4", "FF /1"}, {"I5", "FF /5"}]

    The first call groups by "opcode":
    {
        "component": "opcode",
        "children": [
            ...
            Pos 0xBD: [{"I1", "0xBD"}],
            ...
            Pos 0xCC: [{"I2", "0xCC"}],
            ...
            Pos 0xFF: [{"I4", "FF /1"}, {"I5", "FF /5"}]
        ]
    }

    Positions 0xBD and 0xCC hold a single instruction each, so they turn into leaf entries.
    Position 0xFF is grouped further, this time by "modrmreg":
    {
        "component": "modrmreg",
        "children": [
            Pos 0: []
            Pos 1: [{"I4", "FF /1"}]
            Pos 2: []
            Pos 3: []
            Pos 4: []
            Pos 5: [{"I5", "FF /5"}]
            Pos 6: []
            Pos 7: []
        ]
    }

    Positions 1 and 5 become leaf entries, and the grouping is complete.
    """
    # Find a good grouping component for the current instruction list.
    comp = group_find_component(instructions, components)

    if not comp:
        if len(instructions) == 1:
            # Nothing left to group by and a single instruction: this is a leaf entry.
            return { 'component': 'leaf', 'children': instructions[0] }

        # No grouping component found for multiple instructions - error.
        print("ERROR: Cannot properly group the following instructions. Please review specs!")
        for ins in instructions:
            print("    -> ", ins, " with encoding: ", ins.RawEnc)
        raise Exception("Grouping error: invalid/incomplete specification!")

    kind = comp['type']

    # One (initially empty) bucket for each possible value of the chosen component.
    buckets = [[] for _ in range(components_ilut[kind]['size'])]

    # Distribute the instructions. The consumed value is popped from the encoding, so that the
    # same component value is not used again further down the tree. Instructions that have no
    # value for the component go to position 0.
    for ins in instructions:
        values = ins.Encoding[kind]
        slot = compute_index(values.pop(0)) if len(values) > 0 else 0
        buckets[slot].append(ins)

    # Now recurse, and group every non-empty bucket of instructions.
    for slot, bucket in enumerate(buckets):
        if bucket:
            buckets[slot] = group_instructions(bucket, components)

    return { 'component': kind, 'children': buckets }


def group_dump(
    group: dict,
    level: int = 0
    ) -> None:
    """
    Print the entire translation tree identified by the root "group".

    A leaf prints its instruction. Any other node prints, for each non-empty child, the
    component name and the child position (two hex digits), followed by the child subtree.
    Each line is indented by four spaces per nesting level.

    Parameters
    ----------
    group: dict
        A group as returned by group_instructions ("component" and "children" keys).
    level: int
        The current nesting depth; callers normally leave it at 0.
    """
    indent = "    " * level

    if group['component'] == 'leaf':
        print(indent, group['children'])
        return

    for position, child in enumerate(group['children']):
        # Empty slots carry no instructions; nothing to show.
        if not child:
            continue
        print(indent, group['component'], '%02x' % position)
        group_dump(child, level + 1)


def dump_translation_tables(
    instructions: list[disasmlib.Instruction]
    ):
    """
    Generate the instruction translation trees and write them to the table header files.

    The instructions are split into four tables (legacy, VEX, XOP, EVEX). Each table is grouped
    into a decode tree, converted to C, and written to its own bdx86_table_*.h file under
    ../bddisasm/include (relative to the current working directory).

    Parameters
    ----------
    instructions: list[disasmlib.Instruction]
        All the instructions to be distributed in the translation tables.
    """
    legacy, vex, xop, evex = [], [], [], []

    # Distribute each instruction type into its own table; VEX is checked first, then XOP,
    # then EVEX, and everything else is a legacy instruction.
    for ins in instructions:
        if ins.Vex:
            vex.append(ins)
        elif ins.Xop:
            xop.append(ins)
        elif ins.Evex:
            evex.append(ins)
        else:
            legacy.append(ins)

    # (instructions, grouping components, root table name, header file suffix),
    # listed in the order in which the files are generated.
    maps = [
        (legacy, components_legacy, 'gLegacyMap', 'root'),
        (vex,    components_ex,     'gVexMap',    'vex'),
        (xop,    components_ex,     'gXopMap',    'xop'),
        (evex,   components_ex,     'gEvexMap',   'evex'),
    ]

    for table, components, root_name, suffix in maps:
        tree = group_instructions(table, components)
        cdef = group_generate_c_table(tree, '%s_%s' % (root_name, tree['component']))

        file_name = 'bdx86_table_%s.h' % suffix
        guard = 'BDX86_TABLE_%s_H' % suffix.upper()

        print('Writing the %s file...' % file_name)
        with open('../bddisasm/include/' + file_name, 'wt') as f:
            f.write(header)
            f.write('#ifndef %s\n' % guard)
            f.write('#define %s\n\n' % guard)
            f.write(cdef)
            f.write('\n#endif\n\n')


def group_generate_c_table(
    group: dict,
    name: str
    ) -> str:
    """
    Generate the translation tree, in C format, for the decoding tree identified by group.

    A leaf becomes a ND_TABLE_INSTRUCTION pointing at the instruction's slot in gInstructions.
    Any other node becomes a table of the C type associated with its component, holding one
    pointer per child: ND_NULL for empty slots, or the address of the child's table.

    Child tables are emitted before the table referencing them. Among siblings, the table of
    the highest position comes first in the output.

    Parameters
    ----------
    group: dict
        A group as returned by group_instructions ("component" and "children" keys).
    name: str
        The C identifier of the table generated for this group. Child tables are named
        "<name>_<position as two hex digits>_<child component>".

    Returns
    -------
    The C source text of this table, preceded by the tables of all its descendants.
    """
    if group['component'] == 'leaf':
        # Instruction, construct a dummy table that directly points to the instruction.
        ins = group['children']
        return ''.join([
            'const ND_TABLE_INSTRUCTION %s = \n' % name,
            '{\n',
            '    ND_ILUT_INSTRUCTION,\n',
            '    (const void *)&gInstructions[% 4d]  // %s\n' % (ins.Icount, str(ins)),
            '};\n\n',
        ])

    lut = components_ilut[group['component']]

    entries = []        # One pointer line per child slot, in position order.
    child_tables = []   # C text of the non-empty children, in position order.

    for position, child in enumerate(group['children']):
        if not child:
            entries.append('        /* %02x */ (const void *)ND_NULL,\n' % (position))
            continue

        child_name = name + ('_%02x_%s' % (position, child['component']))
        entries.append('        /* %02x */ (const void *)&%s,\n' % (position, child_name))
        child_tables.append(group_generate_c_table(child, child_name))

    table = ''.join([
        'const %s %s = \n' % (lut['type'], name),
        '{\n',
        '    %s,\n' % lut['ilut'],
        '    {\n',
        ''.join(entries),
        '    }\n',
        '};\n\n',
    ])

    # Children go in front of the current table, last child first.
    return ''.join(reversed(child_tables)) + table


def generate_mnemonics(instructions):
    """
    Return the sorted list of unique mnemonics used by the given instructions.

    Each item of "instructions" only needs a Mnemonic attribute.
    """
    return sorted({ins.Mnemonic for ins in instructions})

def generate_constants(lst, pre = False):
    """
    Return the sorted list of unique C constant names for the given items.

    With pre set, the names are 'ND_PRE_' + Mnemonic (prefix constants); otherwise they are
    'ND_INS_' + Class (instruction class constants). Only the attribute that is actually
    used has to exist on the items.
    """
    if pre:
        names = {'ND_PRE_' + item.Mnemonic for item in lst}
    else:
        names = {'ND_INS_' + item.Class for item in lst}

    return sorted(names)

def generate_constants2(instructions):
    """
    Return two sorted lists of unique C constant names: the ISA sets ('ND_SET_' + Set)
    and the categories ('ND_CAT_' + Category) of the given instructions.
    """
    isa_sets = {'ND_SET_' + ins.Set for ins in instructions}
    categories = {'ND_CAT_' + ins.Category for ins in instructions}

    return sorted(isa_sets), sorted(categories)

def dump_mnemonics(mnemonics, prefixes, fname):
    """
    Write the mnemonics header: the gMnemonics array of C strings, wrapped in an include guard
    and compiled out when BDDISASM_NO_MNEMONIC is defined.

    mnemonics is the list of strings to emit, in order, and fname is the output file path.
    prefixes is accepted but not used.
    """
    with open(fname, 'wt') as f:
        text = [
            header,
            '#ifndef BDX86_MNEMONICS_H\n',
            '#define BDX86_MNEMONICS_H\n',
            '\n',
            '#ifndef BDDISASM_NO_MNEMONIC\n',
            '\n',
            'const char *gMnemonics[%d] = \n' % len(mnemonics),
            '{\n',
            '    ',
        ]

        # Emit the strings one after another; once the running width of a row (mnemonic
        # length plus 4 per entry) goes over 60, start a new indented row.
        row_width = 0
        for mnemonic in mnemonics:
            text.append('"%s", ' % mnemonic)
            row_width += len(mnemonic) + 4
            if row_width > 60:
                row_width = 0
                text.append('\n    ')

        text += [
            '\n};\n',
            '\n',
            '#endif // !BDDISASM_NO_MNEMONIC\n',
            '\n\n',
            '#endif\n\n',
        ]

        f.write(''.join(text))

def dump_constants(constants, prefixes, constants_sets, constants_types, fname):
    """
    Write the constants header, holding three enums: the instruction classes (ND_INS_CLASS),
    the instruction sets (ND_INS_SET) and the instruction categories (ND_INS_CATEGORY).
    Each enum starts with an *_INVALID = 0 member, followed by the given names in order.

    fname is the output file path. prefixes is accepted but not used.
    """
    def enum_text(tag, first_member, members, type_name):
        # C text of one "typedef enum" whose first member is the explicit invalid value.
        lines = [
            'typedef enum %s\n' % tag,
            '{\n',
            '    %s = 0,\n' % first_member,
        ]
        lines.extend('    %s,\n' % member for member in members)
        lines.append('\n} %s;\n\n\n' % type_name)
        return ''.join(lines)

    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_CONSTANTS_H\n')
        f.write('#define BDX86_CONSTANTS_H\n\n')
        f.write('\n')

        # The instruction classes first.
        f.write(enum_text('_ND_INS_CLASS', 'ND_INS_INVALID', constants, 'ND_INS_CLASS'))

        # Now the instruction sets.
        f.write(enum_text('_ND_INS_SET', 'ND_SET_INVALID', constants_sets, 'ND_INS_SET'))

        # Now the instruction types.
        f.write(enum_text('_ND_INS_TYPE', 'ND_CAT_INVALID', constants_types, 'ND_INS_CATEGORY'))

        # Done!
        f.write('\n#endif\n')

def dump_master_table(instructions, fname):
    """
    Write the instruction database header: the gInstructions array, holding one ND_IDBE
    initializer (see cdef_instruction) per instruction, in the order of the given list.

    fname is the output file path.
    """
    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_INSTRUCTIONS_H\n'
                '#define BDX86_INSTRUCTIONS_H\n'
                '\n')
        f.write('const ND_IDBE gInstructions[%s] = \n' % len(instructions))
        f.write('{\n')

        # Entries are separated by ", " and an empty line.
        for ins in instructions:
            f.write('%s, \n\n' % cdef_instruction(ins))

        f.write('\n};\n')
        f.write('\n#endif\n')

def dump_features(features, fname):
    """
    Write the CPUID feature flags header: the ND_CFF packing macro, followed by one
    "#define ND_CFF_<Name> ND_CFF(leaf, subleaf, reg, bit)" line per feature.

    Each feature must provide the Name, Leaf, SubLeaf, Reg and Bit attributes. fname is the
    output file path.
    """
    with open(fname, 'wt') as f:
        f.write(header)
        f.write('#ifndef BDX86_CPUID_FLAGS_H\n')
        f.write('#define BDX86_CPUID_FLAGS_H\n')

        f.write('\n')
        f.write('#define ND_CFF_NO_LEAF    0xFFFFFFFF\n')
        f.write('#define ND_CFF_NO_SUBLEAF 0x00FFFFFF\n')
        f.write('\n')
        f.write('\n')
        f.write('#define ND_CFF(leaf, subleaf, reg, bit) ((ND_UINT64)(leaf) | ((ND_UINT64)((subleaf) & 0xFFFFFF) << 32) | ((ND_UINT64)(reg) << 56) | ((ND_UINT64)(bit) << 59))\n')

        for c in features:
            # Names are padded to 25 characters so that the macro bodies line up. Always keep at
            # least one space, though: without it, a name of 25 or more characters would be
            # fused with "ND_CFF(" and the resulting #define would be broken.
            padding = ' ' * max(1, 25 - len(c.Name))
            f.write('#define ND_CFF_%s%sND_CFF(%s, %s, %s, %s)\n' % (
                c.Name, padding, c.Leaf, c.SubLeaf, 'NDR_' + c.Reg, c.Bit))

        f.write('\n')

        f.write('#endif // CPUID_FLAGS_H\n')

#
# =============================================================================
# Main
# =============================================================================
#
if __name__ == "__main__":
    # The only argument is the directory holding the definition (.dat) files.
    # NOTE: all output paths below are relative to the current working directory.
    if len(sys.argv) < 2:
        print('Usage: %s defs-file-dir' % os.path.basename(sys.argv[0]))
        sys.exit(-1)

    # Extract the flags.
    print('Loading flags access templates...')
    disasmlib.parse_flags_file('%s/flags.dat' % sys.argv[1])

    # Extract the CPUID features.
    print('Loading CPUID feature flags templates...')
    features = disasmlib.parse_cff_file('%s/cpuid.dat' % sys.argv[1])

    # Extract the valid modes.
    print('Loading CPU operating modes templates...')
    insmodes = disasmlib.parse_modess_file('%s/modes.dat' % sys.argv[1])

    # Extract the instructions. glob.glob() returns the matches in arbitrary order, and the
    # sort below only compares mnemonics, so the files are loaded in sorted order: otherwise the
    # relative position (and thus the gInstructions index) of instructions sharing a mnemonic
    # would depend on how the file system happens to enumerate the files.
    for fn in sorted(glob.glob('%s/table*.dat' % sys.argv[1])):
        print('Loading instructions from %s...' % fn)
        instructions = instructions + disasmlib.parse_ins_file(fn)

    # Sort the instructions (stable sort, by mnemonic), then record for each instruction its
    # position in the list; this is its index inside the generated gInstructions array.
    instructions = sorted(instructions, key = lambda x: x.Mnemonic)
    for position, ins in enumerate(instructions):
        ins.Icount = position

    # Generate the mnemonics
    mnemonics = generate_mnemonics(instructions)
    mnemonics_prefixes = generate_mnemonics(prefixes)

    # Generate the constants
    constants = generate_constants(instructions)
    constants_prefixes = generate_constants(prefixes, True)
    constants_sets, constants_types = generate_constants2(instructions)


    #
    # Dump all data to files.
    #

    # Dump the mnemonics
    print('Writing the bdx86_mnemonics.h (instruction mnemonics) file...')
    dump_mnemonics(mnemonics, mnemonics_prefixes, r'../bddisasm/include/bdx86_mnemonics.h')

    # Dump the instruction constants
    print('Writing the bdx86_constants.h (instruction definitions) file...')
    dump_constants(constants, constants_prefixes, constants_sets, constants_types, r'../inc/bdx86_constants.h')

    # Dump the CPUID feature flags.
    print('Writing the bdx86_cpuidflags.h (CPUID feature flags) file...')
    dump_features(features, r'../inc/bdx86_cpuidflags.h')

    # Dump the instruction database.
    print('Writing the bdx86_instructions.h (main instruction database) file...')
    dump_master_table(instructions, r'../bddisasm/include/bdx86_instructions.h')

    # Dump the translation tables.
    print('Writing the translation tables...')
    dump_translation_tables(instructions)

    print('Instruction successfully parsed & header files generated!')