Really awesome tool! Thanks so much for offering it!
Is your tool request related to a problem? Please describe.
Many bioassembly cif files have chain IDs like A-2, A-3, B-1 etc.. This will not work for converting to PDB file, since PDB only takes single character.
I edited the code so that it would relabel the chain id, successfully convert the cif file to pdb file, and also output a json file with chain mapping.
def convert_to_pdb(fhandle, output_prefix):
"""Converts a structure in mmCIF format to PDB format and saves chain ID mapping."""
_a = "{:<6s}{:>5d} {:<4s}{:1s}{:<3s} {:1s}{:>4d}{:1s} {:>8.3f}{:>8.3f}{:>8.3f}"
_a += "{:>6.2f}{:>6.2f} {:<4s}{:<2s}{:2s}\n"
in_section, read_atom = False, False
label_pos = 0
labels = {}
empty = set(('.', '?'))
prev_model = None
atom_num = 0
serial = 0 # Do not read serial numbers from mmCIF; they may be incorrect.
# Initialize the chain ID mapping
chain_id_map = OrderedDict() # Original chain ID -> new single-character chain ID
chain_id_counter = 0
allowed_chain_ids = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz')
model_data = [] # Store atom data for multi-model files
for line in fhandle:
if line.startswith('loop_'): # Start of a section
in_section = True
elif line.startswith('#'): # End of a section
in_section = False
read_atom = False
elif in_section and line.startswith('_atom_site.'): # ATOM/HETATM labels
read_atom = True
labels[line.strip()] = label_pos
label_pos += 1
elif read_atom and line.startswith(('ATOM', 'HETATM')): # Process atom lines
fields = re.findall(r'[^"\s]\S*|".+?"', line) # Handle quoted strings
# Handle model number
model_no = fields[labels.get('_atom_site.pdbx_PDB_model_num')]
if prev_model != model_no:
prev_model = model_no
model_data.append([])
serial = 0
record = fields[labels.get('_atom_site.group_PDB')]
serial += 1
# Atom name
fid = labels.get('_atom_site.auth_atom_id') or labels.get('_atom_site.label_atom_id')
atname = fields[fid].strip('"')
element = fields[labels.get('_atom_site.type_symbol')]
if element in empty:
element = ' '
if len(atname.strip()) < 4 and atname[0].isalpha() and len(element.strip()) < 2:
atname = ' ' + atname # Pad atom name
altloc = fields[labels.get('_atom_site.label_alt_id')]
if altloc in empty:
altloc = ' '
# Residue name
fid = labels.get('_atom_site.auth_comp_id') or labels.get('_atom_site.label_comp_id')
resname = fields[fid]
# Original chain ID
fid = labels.get('_atom_site.auth_asym_id') or labels.get('_atom_site.label_asym_id')
original_chainid = fields[fid]
# Map to a single-character chain ID
if original_chainid not in chain_id_map:
if chain_id_counter >= len(allowed_chain_ids):
emsg = 'ERROR!! Too many unique chains; ran out of chain IDs to assign.\n'
sys.stderr.write(emsg)
sys.exit(1)
new_chainid = allowed_chain_ids[chain_id_counter]
chain_id_map[original_chainid] = new_chainid
chain_id_counter += 1
else:
new_chainid = chain_id_map[original_chainid]
chainid = new_chainid
# Residue number
fid = labels.get('_atom_site.auth_seq_id') or labels.get('_atom_site.label_seq_id')
resnum = int(fields[fid])
icode = fields[labels.get('_atom_site.pdbx_PDB_ins_code')]
if icode in empty:
icode = ' '
x = float(fields[labels.get('_atom_site.Cartn_x')])
y = float(fields[labels.get('_atom_site.Cartn_y')])
z = float(fields[labels.get('_atom_site.Cartn_z')])
occ = float(fields[labels.get('_atom_site.occupancy')])
bfactor = float(fields[labels.get('_atom_site.B_iso_or_equiv')])
charge = fields[labels.get('_atom_site.pdbx_formal_charge')]
if charge in empty:
charge = ' '
segid = original_chainid[:4] # Optionally store original chain ID in segID (max 4 chars)
atom_line = _a.format(record, serial, atname, altloc, resname,
chainid, resnum, icode, x, y, z, occ, bfactor,
segid, element, charge)
atom_num += 1
# Check for PDB format limitations
if atom_num > 99999:
emsg = f"ERROR!! Number of atoms exceeds PDB format limit: '{atom_num}'\n"
sys.stderr.write(emsg)
sys.exit(1)
elif resnum > 9999:
emsg = f"ERROR!! Residue number exceeds PDB format limit: '{resnum}'\n"
sys.stderr.write(emsg)
sys.exit(1)
model_data[-1].append(atom_line)
# Write the chain ID mapping to a JSON file
json_filename = f"{output_prefix}_chain_id_map.json"
with open(json_filename, 'w') as json_file:
json.dump(chain_id_map, json_file, indent=4)
# Inform the user
sys.stderr.write(f"Chain ID mapping saved to {json_filename}\n")
# Write the output
is_ensemble = len(model_data) > 1
if is_ensemble:
for model_no, model in enumerate(model_data, start=1):
yield f"MODEL {model_no:>4d}\n"
for line in model:
yield line
yield 'ENDMDL\n'
else:
for line in model_data[0]:
yield line
yield "END\n"
Really awesome tool! Thanks so much for offering it!
Is your tool request related to a problem? Please describe. Many bioassembly cif files have chain IDs like A-2, A-3, B-1 etc.. This will not work for converting to PDB file, since PDB only takes single character.
I edited the code so that it would relabel the chain id, successfully convert the cif file to pdb file, and also output a json file with chain mapping.