haddocking / pdb-tools

A dependency-free cross-platform swiss army knife for PDB files.
https://haddocking.github.io/pdb-tools/
Apache License 2.0

chain renaming when converting cif to pdb #167

Open zzhangzzhang opened 3 weeks ago

zzhangzzhang commented 3 weeks ago

Really awesome tool! Thanks so much for providing it!

Is your tool request related to a problem? Please describe.
Many bioassembly CIF files have chain IDs like A-2, A-3, B-1, etc. These do not work when converting to PDB format, since the PDB format only allows a single character for the chain ID.

I edited the code so that it relabels the chain IDs, successfully converts the CIF file to a PDB file, and also outputs a JSON file with the chain ID mapping.
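
For example, for a bioassembly whose chains are first encountered in the order A-2, A-3, B-1, the saved {output_prefix}_chain_id_map.json should look roughly like this (new chain letters are assigned in order of first appearance):

{
    "A-2": "A",
    "A-3": "B",
    "B-1": "C"
}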

import json
import re
import sys
from collections import OrderedDict


def convert_to_pdb(fhandle, output_prefix):
    """Converts a structure in mmCIF format to PDB format and saves chain ID mapping."""

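    # Fixed-column format string for PDB ATOM/HETATM records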
    _a = "{:<6s}{:>5d} {:<4s}{:1s}{:<3s} {:1s}{:>4d}{:1s}   {:>8.3f}{:>8.3f}{:>8.3f}"
    _a += "{:>6.2f}{:>6.2f}      {:<4s}{:<2s}{:2s}\n"

    in_section, read_atom = False, False

    label_pos = 0
    labels = {}
    empty = set(('.', '?'))

    prev_model = None
    serial = 0  # Do not read serial numbers from mmCIF; they may be incorrect.

    # Initialize the chain ID mapping
    chain_id_map = OrderedDict()  # Original chain ID -> new single-character chain ID
    chain_id_counter = 0
    allowed_chain_ids = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz')

    model_data = []  # Store atom data for multi-model files

    for line in fhandle:
        if line.startswith('loop_'):  # Start of a section
            in_section = True

        elif line.startswith('#'):  # End of a section
            in_section = False
            read_atom = False

        elif in_section and line.startswith('_atom_site.'):  # ATOM/HETATM labels
            read_atom = True
            labels[line.strip()] = label_pos
            label_pos += 1

        elif read_atom and line.startswith(('ATOM', 'HETATM')):  # Process atom lines
            fields = re.findall(r'[^"\s]\S*|".+?"', line)  # Handle quoted strings

            # Handle model number
            model_no = fields[labels.get('_atom_site.pdbx_PDB_model_num')]
            if prev_model != model_no:
                prev_model = model_no
                model_data.append([])
                serial = 0

            record = fields[labels.get('_atom_site.group_PDB')]
            serial += 1

            # Atom name
            fid = labels.get('_atom_site.auth_atom_id') or labels.get('_atom_site.label_atom_id')
            atname = fields[fid].strip('"')

            element = fields[labels.get('_atom_site.type_symbol')]
            if element in empty:
                element = ' '

            if len(atname.strip()) < 4 and atname[0].isalpha() and len(element.strip()) < 2:
                atname = ' ' + atname  # Pad atom name

            altloc = fields[labels.get('_atom_site.label_alt_id')]
            if altloc in empty:
                altloc = ' '

            # Residue name
            fid = labels.get('_atom_site.auth_comp_id') or labels.get('_atom_site.label_comp_id')
            resname = fields[fid]

            # Original chain ID
            fid = labels.get('_atom_site.auth_asym_id') or labels.get('_atom_site.label_asym_id')
            original_chainid = fields[fid]

            # Map to a single-character chain ID
            if original_chainid not in chain_id_map:
                if chain_id_counter >= len(allowed_chain_ids):
                    emsg = 'ERROR!! Too many unique chains; ran out of chain IDs to assign.\n'
                    sys.stderr.write(emsg)
                    sys.exit(1)
                new_chainid = allowed_chain_ids[chain_id_counter]
                chain_id_map[original_chainid] = new_chainid
                chain_id_counter += 1
            else:
                new_chainid = chain_id_map[original_chainid]

            chainid = new_chainid

            # Residue number
            fid = labels.get('_atom_site.auth_seq_id') or labels.get('_atom_site.label_seq_id')
            resnum = int(fields[fid])

            icode = fields[labels.get('_atom_site.pdbx_PDB_ins_code')]
            if icode in empty:
                icode = ' '

            x = float(fields[labels.get('_atom_site.Cartn_x')])
            y = float(fields[labels.get('_atom_site.Cartn_y')])
            z = float(fields[labels.get('_atom_site.Cartn_z')])
            occ = float(fields[labels.get('_atom_site.occupancy')])
            bfactor = float(fields[labels.get('_atom_site.B_iso_or_equiv')])

            charge = fields[labels.get('_atom_site.pdbx_formal_charge')]
            if charge in empty:
                charge = '  '

            segid = original_chainid[:4]  # Optionally store original chain ID in segID (max 4 chars)

            atom_line = _a.format(record, serial, atname, altloc, resname,
                                  chainid, resnum, icode, x, y, z, occ, bfactor,
                                  segid, element, charge)

            # Check for PDB format limitations: serial and resnum are written
            # to fixed-width 5- and 4-column fields, respectively.
            if serial > 99999:
                emsg = f"ERROR!! Atom serial number exceeds PDB format limit: '{serial}'\n"
                sys.stderr.write(emsg)
                sys.exit(1)
            elif resnum > 9999:
                emsg = f"ERROR!! Residue number exceeds PDB format limit: '{resnum}'\n"
                sys.stderr.write(emsg)
                sys.exit(1)

            model_data[-1].append(atom_line)

    # Write the chain ID mapping to a JSON file
    json_filename = f"{output_prefix}_chain_id_map.json"
    with open(json_filename, 'w') as json_file:
        json.dump(chain_id_map, json_file, indent=4)
    # Inform the user
    sys.stderr.write(f"Chain ID mapping saved to {json_filename}\n")

    # Write the output
    is_ensemble = len(model_data) > 1
    if is_ensemble:
        for model_no, model in enumerate(model_data, start=1):
            yield f"MODEL     {model_no:>4d}\n"
            for line in model:
                yield line
            yield 'ENDMDL\n'
    elif model_data:  # single-model file; guards against inputs with no atom records
        for line in model_data[0]:
            yield line

    yield "END\n"