bddisasm/tests/core/objects.py

#
# Copyright (c) 2024 Bitdefender
# SPDX-License-Identifier: Apache-2.0
#
import sys
import re
import json
import string


class UniqDict(dict):
    def __setitem__(self, key: str, value: str) -> type(None):
        if key in self.keys():
            raise ValueError("key is already present : %s" % (key))
        else:
            return super(UniqDict, self).__setitem__(key, value)


class DecodeShemuParser(object):
    def __init__(self, obj: str):
        self._obj = obj
        self._crt = 0
        self._data = UniqDict()

    def rdline(self) -> str:
        """
        Reads the line corresponding to the current index from the string containing the result.
        This function also increment the index to the current line.
        """
        if self._crt == len(self._obj):
            return None

        out = self._obj[self._crt]
        self._crt += 1

        return out

    def rdnline(self) -> str:
        """
        Reads the next line corresponding to the current index from the string containing the result.
        This function does not increment the index to the current line.
        """
        if self._crt == len(self._obj):
            return None

        out = self._obj[self._crt]
        return out


class ShemuResult(DecodeShemuParser):
    def __init__(self, obj: str):
        DecodeShemuParser.__init__(self, obj)

        self.process()

    def process(self) -> type(None):
        """
        Parses the emulation result generated by disasmtool and stores it in a dictionary as key:value pair as follows:
        {
            "Emulation terminated with status": "0x0000000a",
            "flags:": "0x80",
            "NOPs": "0",
            "NULLs": "0",
            "total instructions": "2",
            "unique instructions": "1",
            "SHEMU_FLAG-0": "SHEMU_FLAG_HEAVENS_GATE"
        }
        """
        self._obj = self._obj.split("\n")
        self._obj = list(filter(None, self._obj))
        # Emulation terminated with status 0x0000000a, flags: 0x10, 0 NOPs, 0 NULLs, 10 total instructions, 10 unique instructions
        line = self.rdline()

        tokens = line.split(",")
        tokens = list(filter(None, tokens))
        tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))
        for token in tokens:
            tsplit = token.lstrip().rstrip().split(" ")
            if (
                all(c in string.hexdigits + "x" for c in tsplit[-1])
                or tsplit[-1].isnumeric()
            ):
                val = tsplit[-1]
                key = " ".join(tsplit[:-1])
            else:
                val = tsplit[0]
                key = " ".join(tsplit[1:])
            self._data[key] = val

        line = self.rdline()
        cnt = 0
        while line:
            self._data["SHEMU_FLAG-%s" % (cnt)] = line.lstrip().rstrip()
            cnt += 1
            line = self.rdline()


class ShemuInstrux(DecodeShemuParser):
    def __init__(self, obj: str):
        DecodeShemuParser.__init__(self, obj)

        self.process()

    def process(self) -> type(None):
        """
        Parses the emulation result generated by disasmtool and stores it in a dictionary as key:value pair as follows:
        {
            "AX": "0x0000000000000000",
            "CX": "0x0000000000000000",
            "DX": "0x0000000000000000",
            "BX": "0x0000000000000000",
            "BP": "0x0000000000000000",
            "SI": "0x0000000000000000",
            ...
            "28": "0x0000000000000000",
            "29": "0x0000000000000000",
            "30": "0x0000000000000000",
            "31": "0x0000000000000000",
            "IP": "0x0000000000200000",
            "GS": "0x0000000000000202"
        }
        """
        self._obj = self._obj.split("\n")
        line = self.rdline()
        while line:
            # X0  = 0x0000000000000000 X1  = 0x0000000000000000 X2  = 0x0000000000000000 X3  = 0x0000000000000000
            if " = " in line:
                tokens = re.findall(r"\w\w\s*=\s*0x\d{16}", line)
                for token in tokens:
                    key = token.lstrip().rstrip().split("=")[0].lstrip().rstrip()
                    val = token.lstrip().rstrip().split("=")[1].lstrip().rstrip()
                    self._data[key] = val
            line = self.rdline()


class DecodeInstrux(DecodeShemuParser):
    def __init__(self, obj: str):
        DecodeShemuParser.__init__(self, obj)

        self.process()

    def process(self) -> type(None):
        """
        Parses an instruction generated by disasmtool and stores it in a dictionary as key:value pair as follows:
        {
            "InstructionBytes": "c4e2784900",
            "InstructionText": "LDTILECFG zmmword ptr [rax]",
            "RIP": "0000000000000000",
            "DSIZE": "32",
            "ASIZE": "64",
            "VLEN": "-",
            "ISA Set": "AMX-TILE",
             ...
            "Operand-0": {
                "Operand": "0",
                "Acc": "R-",
                "Type": "Memory",
                "Size": "64",
                "RawSize": "64",
                "Encoding": "M",
                "Segment": "3",
                "Base": "0"
            }
        }
        """
        self._obj = self._obj.split("\n")

        line = self.rdline()
        while line:
            # 0000000000000000 c4e2784900                      LDTILECFG zmmword ptr [rax]
            # 0000000000000000 62                              db 0x62 (0x80000002)
            if re.search("^[0-9A-F]{16}", line):
                tokens = line.split(" ")
                tokens = list(filter(None, tokens))
                tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))
                self._data["InstructionBytes"] = tokens[1].lstrip().rstrip()
                self._data["InstructionText"] = " ".join(tokens[2:]).lstrip().rstrip()
                self._data["RIP"] = tokens[0].lstrip().rstrip()

            # Operand: 0, Acc:  RW,  Type:   Register, Size:  1, RawSize:  1, Encoding: M, RegType:  General Purpose,
            # RegSize:  1, RegId: 22, RegCount: 1
            if "Operand:" in line:
                while self.rdnline() and "Operand:" not in self.rdnline():
                    # if line.endswith(", ") or line.endswith(","):
                    line += self.rdline()

                local = UniqDict()
                tokens = line.split(",")
                tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))
                dc = 0
                for token in tokens:
                    if key == "Decorator":
                        key = "%s-%s" % (key, dc)
                        dc += 1
                    key = token.lstrip().rstrip().split(":")[0].lstrip().rstrip()
                    val = token.lstrip().rstrip().split(":")[1].lstrip().rstrip()
                    local[key] = val

                key = tokens[0].lstrip().rstrip().split(":")[0].lstrip().rstrip()
                val = tokens[0].lstrip().rstrip().split(":")[1].lstrip().rstrip()
                self._data["%s-%s" % (key, val)] = local

            # EVEX Tuple Type: Tuple 1 scalar, 8 bit
            # EVEX Tuple Type: Full
            elif "EVEX" in line:
                tokens = line.split(":")
                key = tokens[0].lstrip().rstrip()
                val = tokens[1].lstrip().rstrip()
                self._data[key] = val

            elif ": " in line:
                tokens = line.split(",")
                tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))
                for token in tokens:
                    key = token.lstrip().rstrip().split(":")[0].lstrip().rstrip()
                    val = token.lstrip().rstrip().split(":")[1].lstrip().rstrip()
                    self._data[key] = val

            line = self.rdline()
Improve bddisasm and bdshemu test scripts. 2024-09-16 09:47:03 +00:00			`#`
			`# Copyright (c) 2024 Bitdefender`
			`# SPDX-License-Identifier: Apache-2.0`
			`#`
			`import sys`
			`import re`
			`import json`
			`import string`


			`class UniqDict(dict):`
			`def __setitem__(self, key: str, value: str) -> type(None):`
			`if key in self.keys():`
			`raise ValueError("key is already present : %s" % (key))`
			`else:`
			`return super(UniqDict, self).__setitem__(key, value)`


			`class DecodeShemuParser(object):`
			`def __init__(self, obj: str):`
			`self._obj = obj`
			`self._crt = 0`
			`self._data = UniqDict()`

			`def rdline(self) -> str:`
			`"""`
			`Reads the line corresponding to the current index from the string containing the result.`
			`This function also increment the index to the current line.`
			`"""`
			`if self._crt == len(self._obj):`
			`return None`

			`out = self._obj[self._crt]`
			`self._crt += 1`

			`return out`

			`def rdnline(self) -> str:`
			`"""`
			`Reads the next line corresponding to the current index from the string containing the result.`
			`This function does not increment the index to the current line.`
			`"""`
			`if self._crt == len(self._obj):`
			`return None`

			`out = self._obj[self._crt]`
			`return out`


			`class ShemuResult(DecodeShemuParser):`
			`def __init__(self, obj: str):`
			`DecodeShemuParser.__init__(self, obj)`

			`self.process()`

			`def process(self) -> type(None):`
			`"""`
			`Parses the emulation result generated by disasmtool and stores it in a dictionary as key:value pair as follows:`
			`{`
			`"Emulation terminated with status": "0x0000000a",`
			`"flags:": "0x80",`
			`"NOPs": "0",`
			`"NULLs": "0",`
			`"total instructions": "2",`
			`"unique instructions": "1",`
			`"SHEMU_FLAG-0": "SHEMU_FLAG_HEAVENS_GATE"`
			`}`
			`"""`
			`self._obj = self._obj.split("\n")`
			`self._obj = list(filter(None, self._obj))`
			`# Emulation terminated with status 0x0000000a, flags: 0x10, 0 NOPs, 0 NULLs, 10 total instructions, 10 unique instructions`
			`line = self.rdline()`

			`tokens = line.split(",")`
			`tokens = list(filter(None, tokens))`
			`tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))`
			`for token in tokens:`
			`tsplit = token.lstrip().rstrip().split(" ")`
			`if (`
			`all(c in string.hexdigits + "x" for c in tsplit[-1])`
			`or tsplit[-1].isnumeric()`
			`):`
			`val = tsplit[-1]`
			`key = " ".join(tsplit[:-1])`
			`else:`
			`val = tsplit[0]`
			`key = " ".join(tsplit[1:])`
			`self._data[key] = val`

			`line = self.rdline()`
			`cnt = 0`
			`while line:`
			`self._data["SHEMU_FLAG-%s" % (cnt)] = line.lstrip().rstrip()`
			`cnt += 1`
			`line = self.rdline()`


			`class ShemuInstrux(DecodeShemuParser):`
			`def __init__(self, obj: str):`
			`DecodeShemuParser.__init__(self, obj)`

			`self.process()`

			`def process(self) -> type(None):`
			`"""`
			`Parses the emulation result generated by disasmtool and stores it in a dictionary as key:value pair as follows:`
			`{`
			`"AX": "0x0000000000000000",`
			`"CX": "0x0000000000000000",`
			`"DX": "0x0000000000000000",`
			`"BX": "0x0000000000000000",`
			`"BP": "0x0000000000000000",`
			`"SI": "0x0000000000000000",`
			`...`
			`"28": "0x0000000000000000",`
			`"29": "0x0000000000000000",`
			`"30": "0x0000000000000000",`
			`"31": "0x0000000000000000",`
			`"IP": "0x0000000000200000",`
			`"GS": "0x0000000000000202"`
			`}`
			`"""`
			`self._obj = self._obj.split("\n")`
			`line = self.rdline()`
			`while line:`
			`# X0 = 0x0000000000000000 X1 = 0x0000000000000000 X2 = 0x0000000000000000 X3 = 0x0000000000000000`
			`if " = " in line:`
			`tokens = re.findall(r"\w\w\s=\s0x\d{16}", line)`
			`for token in tokens:`
			`key = token.lstrip().rstrip().split("=")[0].lstrip().rstrip()`
			`val = token.lstrip().rstrip().split("=")[1].lstrip().rstrip()`
			`self._data[key] = val`
			`line = self.rdline()`


			`class DecodeInstrux(DecodeShemuParser):`
			`def __init__(self, obj: str):`
			`DecodeShemuParser.__init__(self, obj)`

			`self.process()`

			`def process(self) -> type(None):`
			`"""`
			`Parses an instruction generated by disasmtool and stores it in a dictionary as key:value pair as follows:`
			`{`
			`"InstructionBytes": "c4e2784900",`
			`"InstructionText": "LDTILECFG zmmword ptr [rax]",`
			`"RIP": "0000000000000000",`
			`"DSIZE": "32",`
			`"ASIZE": "64",`
			`"VLEN": "-",`
			`"ISA Set": "AMX-TILE",`
			`...`
			`"Operand-0": {`
			`"Operand": "0",`
			`"Acc": "R-",`
			`"Type": "Memory",`
			`"Size": "64",`
			`"RawSize": "64",`
			`"Encoding": "M",`
			`"Segment": "3",`
			`"Base": "0"`
			`}`
			`}`
			`"""`
			`self._obj = self._obj.split("\n")`

			`line = self.rdline()`
			`while line:`
			`# 0000000000000000 c4e2784900 LDTILECFG zmmword ptr [rax]`
			`# 0000000000000000 62 db 0x62 (0x80000002)`
			`if re.search("^[0-9A-F]{16}", line):`
			`tokens = line.split(" ")`
			`tokens = list(filter(None, tokens))`
			`tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))`
			`self._data["InstructionBytes"] = tokens[1].lstrip().rstrip()`
			`self._data["InstructionText"] = " ".join(tokens[2:]).lstrip().rstrip()`
			`self._data["RIP"] = tokens[0].lstrip().rstrip()`

			`# Operand: 0, Acc: RW, Type: Register, Size: 1, RawSize: 1, Encoding: M, RegType: General Purpose,`
			`# RegSize: 1, RegId: 22, RegCount: 1`
			`if "Operand:" in line:`
			`while self.rdnline() and "Operand:" not in self.rdnline():`
			`# if line.endswith(", ") or line.endswith(","):`
			`line += self.rdline()`

			`local = UniqDict()`
			`tokens = line.split(",")`
			`tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))`
			`dc = 0`
			`for token in tokens:`
			`if key == "Decorator":`
			`key = "%s-%s" % (key, dc)`
			`dc += 1`
			`key = token.lstrip().rstrip().split(":")[0].lstrip().rstrip()`
			`val = token.lstrip().rstrip().split(":")[1].lstrip().rstrip()`
			`local[key] = val`

			`key = tokens[0].lstrip().rstrip().split(":")[0].lstrip().rstrip()`
			`val = tokens[0].lstrip().rstrip().split(":")[1].lstrip().rstrip()`
			`self._data["%s-%s" % (key, val)] = local`

			`# EVEX Tuple Type: Tuple 1 scalar, 8 bit`
			`# EVEX Tuple Type: Full`
			`elif "EVEX" in line:`
			`tokens = line.split(":")`
			`key = tokens[0].lstrip().rstrip()`
			`val = tokens[1].lstrip().rstrip()`
			`self._data[key] = val`

			`elif ": " in line:`
			`tokens = line.split(",")`
			`tokens = list(filter(lambda item: item.lstrip().rstrip(), tokens))`
			`for token in tokens:`
			`key = token.lstrip().rstrip().split(":")[0].lstrip().rstrip()`
			`val = token.lstrip().rstrip().split(":")[1].lstrip().rstrip()`
			`self._data[key] = val`

			`line = self.rdline()`