Open Spacefish opened 2 years ago
Thanks for all these references!
The existing code is just a rough translation of Aston's REG implementation into Rust, so it's really helpful to have more references. The main issue (as far as I remember it) is that the different data objects can reference each other, while entab normally processes data as a linear stream, so all the sections will need to be read into memory first (and different sections can have different data types, so there's some type coercion that may need to happen).
I've stumbled upon this package. Entab only seems to be able to open a very specific, old reg file format from Agilent. Newer ChemStation versions (with the same magic byte sequence at the beginning) are more verbose.
I've attached some example files from the same Method with a pressure export.
To follow up on this: since I really needed that extraction for pressure surveillance, I've spent quite some time trying to understand this file format. I cannot make you a PR as I lack Rust knowledge, but this script extracts some of the data (xy-traces as well as the current state of the modules):
import io
import re
import struct

import numpy as np
# When True, the parser prints progress (part numbers, section names) and
# any unrecognised separators / section types to stdout.
DEBUG = False
def PascalStringInt8Ascii(stream):
    """Read a Pascal-style ASCII string with a one-byte length prefix.

    Args:
        stream: Binary file-like object positioned at the length byte.

    Returns:
        A ``(size, text)`` tuple: the length prefix and the decoded string.

    Raises:
        struct.error: If the stream ends before the prefix or the payload
            has been read completely.
        UnicodeDecodeError: If the payload is not valid ASCII.
    """
    (size,) = struct.unpack("<B", stream.read(1))
    # Going through struct with an explicit "<size>s" format rejects a short
    # read instead of silently returning a truncated string.
    (raw,) = struct.unpack(f"{size}s", stream.read(size))
    return size, raw.decode("ascii")
def PascalStringInt16Ascii(stream):
    """Read a Pascal-style ASCII string with a 16-bit length prefix.

    Args:
        stream: Binary file-like object positioned at the little-endian
            unsigned 16-bit length field.

    Returns:
        A ``(size, text)`` tuple: the length prefix and the decoded string.

    Raises:
        struct.error: If the stream ends before the prefix or the payload
            has been read completely.
        UnicodeDecodeError: If the payload is not valid ASCII.
    """
    (size,) = struct.unpack("<H", stream.read(2))
    # The "<size>s" format makes struct reject a short read rather than
    # returning a truncated string.
    (raw,) = struct.unpack(f"{size}s", stream.read(size))
    return size, raw.decode("ascii")
def PascalStringInt16Utf16(stream):
    """Read a length-prefixed UTF-16 (little-endian) string.

    The prefix is a little-endian unsigned 16-bit count of UTF-16 code units
    (two bytes each), followed by that many code units.

    Args:
        stream: Either a bytes-like object (``bytes``, ``bytearray``,
            ``memoryview``) that starts with the length prefix, or a binary
            file-like object positioned at the prefix.

    Returns:
        A ``(size, text)`` tuple where ``size`` is the code-unit count from
        the prefix and ``text`` is the decoded string.

    Raises:
        struct.error: If the prefix is incomplete, or (file-like input only)
            the payload is shorter than announced.
        UnicodeDecodeError: If the payload is not valid UTF-16.
    """
    if isinstance(stream, (bytes, bytearray, memoryview)):
        (size,) = struct.unpack_from("<H", stream, 0)
        # Slicing keeps the lenient behaviour for buffers: a short payload is
        # decoded as far as it goes.
        raw = bytes(stream[2:2 + size * 2])
    else:
        (size,) = struct.unpack("<H", stream.read(2))
        (raw,) = struct.unpack(f"{size * 2}s", stream.read(size * 2))
    # The byte order is fixed by the file format, so name it explicitly: the
    # generic "utf-16" codec would sniff (and strip) a leading BOM and
    # otherwise fall back to the host's native byte order.
    return size, raw.decode("utf-16-le")
def UInt8(stream):
    """Read one unsigned 8-bit integer from a binary file-like ``stream``.

    Raises:
        struct.error: If no byte is left to read.
    """
    (value,) = struct.unpack("<B", stream.read(1))
    return value
def UInt16(stream):
    """Read one little-endian unsigned 16-bit integer from ``stream``.

    Args:
        stream: Binary file-like object with at least two bytes remaining.

    Raises:
        struct.error: If fewer than two bytes are left to read.
    """
    (value,) = struct.unpack("<H", stream.read(2))
    return value
class FileClass:
    """Container for the parsed contents of one register file.

    Attributes:
        header: Dict of header fields filled in by the parser.
        parts: List with one dict per parsed part, in file order.
    """

    def __init__(self):
        # Created per instance: as class-level attributes the dict and list
        # would be shared, so parsing a second file would append to the
        # results of the first.
        self.header = {}
        self.parts = []
def parse_ndr_string(string):
title = PascalStringInt16Utf16(string)[1]
values = {}
while True:
pair_separator = string.read(2)
match pair_separator:
case b"\t\x80":
field_name = PascalStringInt16Utf16(string)[1]
field_value_separator = string.read(6) # Must be 0xFFFF 0000 00
field_value = struct.unpack("<d", string.read(8))[0]
case b"\x0b\x80":
# In some parts, 0x0b80 indicates string / string (in presence of \t\x80), in others, 0x0b80 indicated string / double (in presence if \r\x80)
field_name = PascalStringInt16Utf16(string)[1]
field_value_separator = string.read(6) # Must be 0xFFFF 0000 00
rollback = string.tell()
lookahead = string.read(4)
string.seek(rollback)
size, field_value = PascalStringInt16Utf16(string)
#print("Size", size, field_name, field_value)
#print("lookahead", lookahead[:2].hex(), lookahead[2:])
if size == 0:
if len(lookahead) == 3:
pass
elif lookahead[:2] == b"\x00\x00" and lookahead[2:] in [b"\x00", b"\t\x80", b"\x0b\x80", b"\r\x80"]:
pass
else:
string.seek(rollback)
field_value = struct.unpack("<d", string.read(8))[0]
case b"\r\x80":
field_name = PascalStringInt16Utf16(string)[1]
field_value_separator = string.read(6) # Must be 0xFFFF 0000 00
field_value = PascalStringInt16Utf16(string)[1]
case b"":
# Reached EOF
break
case b"\x01":
break
case b"\x00":
break
case _:
print("Unknown pair_separator", pair_separator) if DEBUG else None
values[field_name] = field_value
#print(field_name, field_value)
return (title, values)
float64 = np.dtype(np.float64)
float64 = float64.newbyteorder("<")
part_separator = "040000000700010000003e00FFFF1101"
part_separator = "040000000700010000003e00"
with open(r"E:\LCMS\BASILIUS 2023-10-05 14-26-41\002-P2-A1-BS513 F6.D\lcdiag.reg", "rb") as fb:
buffer = fb.read()
reg_parts = buffer.split(bytes.fromhex(part_separator))
File = FileClass()
for i_part, part in enumerate(reg_parts):
part_buf = io.BytesIO(part)
if i_part == 0:
#print(part.hex())
File.header["magic"] = part_buf.read(4)
_, File.header["format"] = PascalStringInt8Ascii(part_buf)
part_buf.seek(0x18)
_, File.header["version"] = PascalStringInt8Ascii(part_buf)
part_buf.seek(0x26)
File.header["n_parts"] = UInt16(part_buf)
else:
print("Part", i_part) if DEBUG else None
File.parts.append({
"data": None,
"???": [],
"trace": None,
})
match part_buf.read(4):
case b'\xff\xff\x11\x01':
sections = re.split(b"\xff\xff\x11\x00|\xff\xff\x11\x01|\xff\xff\x11\x02|\xff\xff\x11\x11|\xff\xff\x21\x00", part)[1:]
for section in sections:
section_buf = io.BytesIO(section)
#print(section[:16])
size, section_name = PascalStringInt16Ascii(section_buf)
section = section[2+size:]
print("\t", section_name) if DEBUG else None
match section_name:
case "CHPUserObject":
pass
case "CHPList":
pass
case "CHPNdrDouble":
unpacked = struct.unpack("<iid", section[-16:])
# d is always 1.0; No clue what they mean
case "CHPNdrString":
header = section_buf.read(10)
unpacked = parse_ndr_string(section_buf)
File.parts[i_part-1]["data"] = unpacked
print("\t", unpacked) if DEBUG else None
case "CHPNdrObject":
field_name = PascalStringInt16Utf16(section_buf)
field_separator = section_buf.read(6)
field_value = section_buf.read().hex()
File.parts[i_part-1]["???"].append((field_name, field_value))
case "CHPTable":
# Very unsure about this
# Not what I'm interested in right now
# There seem to be grouping (C001, C002, C003, C004, C005) which are in some files used, in some files not completely
# Some of the records are separated by 16, 1C or 1D and don't use Pascal string formatting, but is plain utf16
File.parts[i_part-1]["???"].append(("CHPTable", [
section_buf.read().hex()
]))
case "CHPAnnText":
blocks = section.split(bytes.fromhex("FFFFFF80"))
text = PascalStringInt16Utf16(blocks[1])[1]
File.parts[i_part-1]["title"] = text
case "CHPDatDoubleSliced":
# Not sure what this means
# Comes in my files always before CHPDatDoubleRow, might be related to that?
File.parts[i_part-1]["???"].append(("CHPDatDoubleSliced", [
section_buf.read().hex()
]))
case "CHPDatDoubleRow":
# Contains y, x data in two blocks with units
blocks = section.split(bytes.fromhex("00"*7 + "FF" * 7))
y_unit = PascalStringInt16Utf16(blocks[1])[1]
y_data_offset = 2 + len(y_unit)*2 + 4
y_size = struct.unpack("<I", blocks[1][y_data_offset:y_data_offset+4])[0]
y_block = {
"unit": y_unit,
"size": y_size,
"data": np.frombuffer(blocks[1][y_data_offset+4+1:y_data_offset+4+y_size*8+1], dtype=float64)
}
x_unit = PascalStringInt16Utf16(blocks[2])[1]
x_data_offset = 2 + len(x_unit)*2 + 4
x_size = struct.unpack("<I", blocks[2][x_data_offset:x_data_offset+4])[0]
x_block = {
"unit": x_unit,
"size": x_size,
"data": np.frombuffer(blocks[2][x_data_offset+4+1:x_data_offset+4+x_size*8+1], dtype=float64)
}
File.parts[i_part-1]["trace"] = (x_block, y_block)
case _:
print("unsupported type", section_name) if DEBUG else None
case _:
raise Exception("Not supported")
print("---------\n\n") if DEBUG else None
I'm heavily struggling with the information in the `CHPTable` section, though.
Thank you so much for the sample files! The files I have (from 10-ish years ago) start with ".32..REGISTER FILE.......A.00.01" and it looks like the new ones start with ".32..REGISTER FILE.......notused". At a first glance, the new format looks kind of like Thermo CF/DXFs (also CArchives, I think) so I can potentially re-use/re-factor some of that code; I'm not sure about section ordering (also an issue for the old files) requiring the parser to buffer all the information first before it can be reassembled, but maybe that's changed with the new format(s).
The `CHPTable` part is interesting. Not sure what's going on, but it appears to be some kind of 242-byte data header, the number 1 in u32, 414 bytes of 1C/1D/16 delimited text, a 38-byte "Description" block, 190 bytes of Pascal-like strings, and then the C002, C003, C004 pattern you mentioned. I think there's some record pattern starting with `07` in the header? In both of the files you sent, there appears to be a 27-byte block (starts with `0107` right after `CHPTable`) and then there's a repeating 18-byte pattern that starts with `07` and sometimes ends with a f64 (only in your sample1 file; 426.5, 456.5, 461.5, 444.0). This stops ~90-ish bytes before the delimited text appears and I'd guess it's either encoding data or references to the data elsewhere?
I believe the Core of the Chemstation .reg format is a CArchive https://learn.microsoft.com/en-us/cpp/mfc/reference/carchive-class?view=msvc-170 here is a reader/writer implemented in cpp: https://github.com/pixelspark/corespark/blob/e2aa78fe13e273fcc9bb2665ab4c700e89895741/Libraries/atlmfc/src/mfc/arcobj.cpp
The "data type numbers" are not fixed: they depend on the order in which the different data objects are written to the archive. A class gets its number the first time it is written to the file. See this code: https://github.com/pixelspark/corespark/blob/e2aa78fe13e273fcc9bb2665ab4c700e89895741/Libraries/atlmfc/src/mfc/arcobj.cpp#L259