Open sporniket opened 2 years ago
"""
draft assembly 68k parser
"""
import re
import sys
from enum import Enum
class BasicFragmentType(Enum):
    """Category assigned to a basic fragment of a source line."""

    # Fragment that is neither whitespace nor special.
    WORD = 0
    # Fragment starting with whitespace.
    WHITE = 1
    # Fragment starting with a character outside of [0-9A-Za-z_].
    SPECIAL = 2
class BasicFragment():
    """Piece of a source line, tagged with its BasicFragmentType."""

    def __init__(self, src):
        """Keep the fragment text and classify it.

        :param src: text of the fragment.
        """
        self.src = src
        # Raw strings keep the regex escapes out of reach of Python's own
        # string-escape processing ("\s" is not a valid string escape).
        if re.match(r"\s+", src):
            self.type = BasicFragmentType.WHITE
        elif re.match(r"[^0-9A-Za-z_]", src):
            self.type = BasicFragmentType.SPECIAL
        else:
            self.type = BasicFragmentType.WORD

    def __str__(self):
        """Return the fragment as "('<text>', <TYPE NAME>)"."""
        return f"('{self.src}', {self.type.name})"
class LineOfCode():
    """A source line, its fragments, and the label found at its start, if any."""

    def __init__(self, src, fragments):
        """Store the line and extract the leading label.

        The label is the concatenation of the leading fragments, up to the
        first whitespace fragment or the first special fragment other than
        ".", "\\" and "@". ``self.label`` is None when that prefix is empty.

        :param src: the source text of the line.
        :param fragments: the fragments of the line; each one is expected to
            expose ``src`` and ``type`` attributes.
        """
        self.src = src
        self.fragments = fragments
        labelParts = []
        # Iterating (instead of indexing until a whitespace fragment shows up)
        # also terminates on an empty line or a line without any whitespace.
        for fragment in fragments:
            if fragment.type == BasicFragmentType.WHITE:
                break
            if fragment.type == BasicFragmentType.SPECIAL and fragment.src not in (".", "\\", "@"):
                break
            labelParts.append(fragment.src)
        label = "".join(labelParts)
        self.label = label if label else None

    def __str__(self):
        """Return a multi-line dump: source, label status and fragment list."""
        renderedFragments = [f"{fragment}" for fragment in self.fragments]
        labelLine = f"Label = {self.label}\n" if self.label is not None else "No label\n"
        return f"=====[Line of code]=====\n{self.src}\n{labelLine}{renderedFragments}\n---- ---- ---- ----"
class Parser():
    """Turn raw source lines into LineOfCode instances."""

    def parseLine(self, src: str) -> LineOfCode:
        """Split a line into fragments and wrap them into a LineOfCode.

        The line is first split around runs of whitespace (the runs are kept
        as fragments); every other piece is then split around each character
        outside of [0-9A-Za-z_], that character being kept as a fragment too.

        :param src: the source line.
        :return: the LineOfCode built from ``src`` and its fragments.
        """
        fragments = []
        # Raw strings: "\s" must reach the regex engine untouched.
        for preFragment in re.split(r"(\s+)", src):
            if re.match(r"^\s+$", preFragment):
                fragments.append(preFragment)
            else:
                # re.split yields empty strings around adjacent separators
                # (and for an empty piece): drop them.
                fragments.extend(
                    piece
                    for piece in re.split(r"([^0-9A-Za-z_])", preFragment)
                    if piece
                )
        return LineOfCode(src, [BasicFragment(f) for f in fragments])
def test_Parser_parseLine_can_find_label_when_there_is_one():
    """A label at the very start of the line is found, with or without colon."""
    loc = Parser().parseLine("DestAssetFont0: dc.b 'font_0.dat',0")
    print(loc)
    assert loc.label == "DestAssetFont0"
    # Raw strings: "\@" is not a valid escape sequence in a regular string
    # literal; the runtime value (backslash followed by "@") is unchanged.
    loc = Parser().parseLine(r".nextEntry\@ _Setcolor d5,#-1")
    print(loc)
    assert loc.label == r".nextEntry\@"
def test_Parser_parseLine_does_not_set_label_when_there_is_none():
    """A line starting with whitespace has no label."""
    loc = Parser().parseLine(" moveq #0,d5")
    print(loc)
    # None is a singleton: compare by identity.
    assert loc.label is None
def run(test):
    """Print a banner identifying the test, then call it.

    :param test: a callable taking no argument.
    """
    # A function object interpolates as "<function ... at 0x...>"; show its
    # name instead, falling back to the object itself when it has none.
    print(f"----- {getattr(test, '__name__', test)} -----")
    test()
if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No argument: run the self tests.
        print("=====[ START SELF TESTS ]=====")
        run(test_Parser_parseLine_can_find_label_when_there_is_one)
        run(test_Parser_parseLine_does_not_set_label_when_there_is_none)
    else:
        # Each argument is the path of a source file to parse and dump.
        prsr = Parser()
        for file in sys.argv[1:]:
            with open(file) as f:
                for line in f:
                    # Parser only defines parseLine; the former call to
                    # "conditionLine" raised AttributeError.
                    print(prsr.parseLine(line))
Decompose by field
"""
---
(c) 2022 David SPORN
---
This is part of Sporniket's "experiments" project.
Sporniket's "experiments" project is free software: you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License as published by the
Free Software Foundation, either version 3 of the License, or (at your option)
any later version.
Sporniket's "experiments" project is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License along with Sporniket's "experiments" project.
If not, see <https://www.gnu.org/licenses/>.
---
"""
class FieldOfAs68kLineOfCode:
    """One field of a line of code, kept as its source text."""

    def __init__(self, source):
        """Keep the given text.

        :param source: the text of the field.
        """
        # Stored as received, without any processing.
        self.source = source
class As68kLineOfCode:
    """Line of code seen as four named fields, all unset at creation."""

    def __init__(self):
        """Create the ``fields`` mapping with every field set to None."""
        # dict.fromkeys keeps the order of the names and maps each to None.
        self.fields = dict.fromkeys(("label", "mnemonic", "operand", "comment"))
class As68kLineParser:
    """Split a line of code into label, mnemonic, operand and comment fields."""

    # Name of each field, indexed by its position in the line.
    _FIELD_NAMES = ("label", "mnemonic", "operand", "comment")
    # Characters that separate fields.
    _BLANKS = (" ", "\t")

    def parseLine(self, source):
        """Parse one line, scanning it character by character.

        Rules applied:

        * fields are separated by runs of spaces/tabs; a run counts as a
          single separator;
        * a line starting with a blank has no label, unless its first token
          ends with ":";
        * inside the operand field, text between matching single or double
          quotes is kept verbatim, blanks included;
        * a token starting with "*" or ";" starts the comment, which extends
          to the end of the line;
        * whatever follows the operand field is the comment.

        :param source: the line to parse.
        :return: an As68kLineOfCode; each entry of ``fields`` is either None
            or a FieldOfAs68kLineOfCode.
        """
        result = As68kLineOfCode()
        accumulator = ""
        field = 0
        isInsideString = False
        stringMarker = None
        for char in source:
            if isInsideString:
                accumulator += char
                if char == stringMarker:
                    isInsideString = False
                continue
            if field == 2 and char in ('"', "'"):
                stringMarker = char
                isInsideString = True
                accumulator += char
                continue
            # Blanks separate fields until the comment has started; blanks
            # located before the first character of the comment are skipped.
            if char in self._BLANKS and (field < 3 or not accumulator):
                if accumulator:
                    field = self._storeField(result, field, accumulator)
                    accumulator = ""
                elif field == 0:
                    # Leading blank(s): the line has no label at column 0.
                    # Further blanks of the same run must not advance again.
                    field = 1
                continue
            if not accumulator and char in ("*", ";"):
                field = 3
            accumulator += char
        # Flush the pending token: the line may end right after a label, a
        # mnemonic or an operand, not only after a comment.
        if accumulator:
            self._storeField(result, field, accumulator)
        return result

    def _storeField(self, result, field, text):
        """Store ``text`` in the field of index ``field``; return the next index."""
        if field == 1 and text.endswith(":"):
            # A first token ending with ":" is a label, even after blanks.
            field = 0
        result.fields[self._FIELD_NAMES[field]] = FieldOfAs68kLineOfCode(text)
        return field + 1
def test_should_support_4_fields():
    """A full line yields its label, mnemonic, operand and comment."""
    fields = As68kLineParser().parseLine(
        "label move.l d0,(a0)+ move data to memory"
    ).fields
    assert fields["label"].source == "label"
    assert fields["mnemonic"].source == "move.l"
    assert fields["operand"].source == "d0,(a0)+"
    assert fields["comment"].source == "move data to memory"
def test_should_support_label_after_whitespaces():
    """A first token ending with a colon is a label, even after whitespace."""
    expectedSources = {
        "label": "label:",
        "mnemonic": "move.l",
        "operand": "d0,(a0)+",
        "comment": "move data to memory",
    }
    fields = As68kLineParser().parseLine(
        " label: move.l d0,(a0)+ move data to memory"
    ).fields
    for name, expected in expectedSources.items():
        assert fields[name].source == expected
def test_should_support_line_without_label():
    """A line starting with whitespace has no label."""
    source = " move.l d0,(a0)+ move data to memory"
    parsedLine = As68kLineParser().parseLine(source)
    # None is a singleton: compare by identity.
    assert parsedLine.fields["label"] is None
    assert parsedLine.fields["mnemonic"].source == "move.l"
    assert parsedLine.fields["operand"].source == "d0,(a0)+"
    assert parsedLine.fields["comment"].source == "move data to memory"
def test_should_support_comment_lines():
    """A line whose first token starts with '*' or ';' is entirely a comment."""
    for source in (
        "* a comment line",
        " * a comment line",
        "; a comment line",
        " ; a comment line",
    ):
        comment = As68kLineParser().parseLine(source).fields["comment"]
        # Leading whitespace is not part of the comment.
        assert comment.source == source.lstrip()
def test_should_support_string_litteral_with_spaces():
    """Spaces inside a quoted string do not end the operand field."""
    source = ' dc.b "hello world",0 define C string'
    parsedLine = As68kLineParser().parseLine(source)
    # None is a singleton: compare by identity.
    assert parsedLine.fields["label"] is None
    assert parsedLine.fields["mnemonic"].source == "dc.b"
    assert parsedLine.fields["operand"].source == '"hello world",0'
    assert parsedLine.fields["comment"].source == "define C string"
First goal: convert a single line (e.g. `Add.b (a5),d2`) into a stream of bytes (e.g. `1101.0100,0001.0101`); case insensitive.
interface of the converter :
def convert(mnemonic, *, operand1=None, operand2=None):
    """Interface sketch of the converter; returns a sample of the result shape.

    :param mnemonic: the instruction mnemonic.
    :param operand1: first operand, if any (keyword only).
    :param operand2: second operand, if any (keyword only).
    :return: a dict with the keys "bytes", "relocatable" and "pretty_print".
    """
    return {
        "bytes": [],  # list of bytes
        "relocatable": [
            {"offset": 2, "length": 4, "symbol": "what"},
            {"offset": 6, "length": 2, "symbol": "ever"},
        ],
        # each part canonically formatted (correct form of the mnemonics, lowercase,...)
        "pretty_print": ["add.b", "d0", "d2"],
    }
<instruction>[.<suffix>]
def elaborate(suffix="", *, operand1=None, operand2=None):
    """Interface sketch of the per-instruction handler for ``<instruction>[.<suffix>]``.

    :param suffix: the suffix following the instruction name, "" when absent.
    :param operand1: first operand, if any (keyword only).
    :param operand2: second operand, if any (keyword only).
    :raises NotImplementedError: always; the body is still to be written.
    """
    # do stuff, then return the result
    raise NotImplementedError("elaborate is an interface sketch")
A source file contains preprocessing directives (equates, static symbols –e.g. rs.b/w/l...–, macros, includes,...)
TO BE CONTINUED
Manual test
Create a single-line source file containing the reference input; let's say the file is named `add_b.s`: `add.b (a5),d2`
Use vasm :
vasmm68k_mot add_b.s -Fbin
Then `a.out` contains the reference byte stream:
$ hd a.out
00000000  d4 15  |..|
00000002
lineOfCode
of a list of supported usage of an instruction
byteStream
lineOfCode
, then the output byte stream is byteStream
byteStream
, then the output decompiled line of code is lineOfCode
Compare with the output obtained with GNU `as`. A conversion of each instruction to follow the syntax understood by `as` MUST be performed. If an expression is not convertible, it MUST be reported. If the output differs from the output of `vasm`, it MUST be reported.
When `as` and `vasm` do not agree, a case-by-case analysis is performed, which will most likely end up as a test case. At the very least, verify that the offending byte streams represent the single instruction expression under test.
PRM : Programmer's Reference Manual
One word = a 16bits value, big endian
From section 2.1 of the PRM
Upon reading the Programmer's Reference Manual, one notices that the opcode bits are organized like this: `1111.222.333.444.555`. In other words, a hexadecimal digit followed by 4 octal digits. In particular, the addressing mode is specified by the last 2 octal digits.
In a format specification, individual bits are specified, e.g. `1101.rrr.ooo.eee.aaa` for `add <ea>,dy` or `add dx,<ea>`. '1' and '0' are bits that have a fixed value. A lettered bit is grouped in the subdigit field where it is contained. E.g. `rrr` means that r can take a value between 0 and 7.
In a byte stream, the opcode will be represented as a hexadecimal digit, followed by a dot, followed by 4 octal digits. E.g. `Add.b (a5),d2`, which compiles into the byte stream `1101.0100,0001.0101`, will be represented as `d.2025`.
The other words do not use octal-aligned subfields, thus the classical hexadecimal representation will be enough, either
TODO
e.g. :
x.yyyy[zzzz ..../ssss ....>dddd ....
Given a typical line of assembly code like :
with_label: symbol_to_resolve [operands] [comment]
;without label
symbol_to_resolve [operands] [comment]
symbol_to_resolve
will be looked up from those sets, in that specified order :
vasm is quite a fine tool, except for the licence terms, which prevent freedom #0.
I should get the same binary from sporny-wrecking-ball when using vasm or my compiler