# Copyright 2004-2008 Nanorex, Inc. See LICENSE file for details. """ files_pdb.py -- reading and writing PDB files @version: $Id$ @copyright: 2004-2008 Nanorex, Inc. See LICENSE file for details. History: This was part of fileIO.py, until bruce 050414 started splitting that into separate modules for each file format. bruce 070410 added some hacks to read more pdb files successfully -- but they need review by someone who knows whether they're correct or not. (I'll commit this to both branches, Qt3 & Qt4, since this file is presently identical in both, so doing that should not cause a problem.) """ import os, time from model.chunk import Chunk from model.chem import Atom from model.bonds import bond_atoms from operations.bonds_from_atoms import inferBonds from string import capitalize from model.elements import PeriodicTable, Singlet from platform_dependent.PlatformDependent import fix_plurals from utilities.Log import redmsg, orangemsg from geometry.VQT import A from utilities.version import Version from utilities.debug_prefs import debug_pref, Choice_boolean_False from datetime import datetime from model.Comment import Comment from foundation.Group import Group import foundation.env as env from utilities.constants import gensym from protein.model.Protein import Protein from protein.model.Residue import Residue def _readpdb(assy, filename, isInsert = False, showProgressDialog = False, chainId = None): """ Read a Protein DataBank-format file into a single new chunk, which is returned unless there are no atoms in the file, in which case a warning is printed and None is returned. (The new chunk (if returned) is in assy, but is not yet added into any Group or Part in assy -- caller must do that.) Unless isInsert = True, set assy.filename to match the file we read, even if we return None. @param assy: The assembly. @type assy: L{assembly} @param filename: The PDB filename to read. @type filename: string @param isInsert: If True, the PDB file will be inserted into the current assembly. If False (default), the PDB is opened as the assembly. @param isInsert: boolean @param showProgressDialog: if True, display a progress dialog while reading a file. @type showProgressDialog: boolean @return: A chunk containing the contents of the PDB file. @rtype: L{Chunk} @see: U{B{PDB File Format}} """ fi = open(filename,"rU") lines = fi.readlines() fi.close() dir, nodename = os.path.split(filename) if not isInsert: assy.filename = filename ndix = {} mol = Chunk(assy, nodename) numconects = 0 atomname_exceptions = { "HB":"H", #k these are all guesses -- I can't find this documented # anywhere [bruce 070410] ## "HE":"H", ### REVIEW: I'm not sure about this one -- ### leaving it out means it's read as Helium, # but including it erroneously might prevent reading an actual Helium # if that was intended. # Guess for now: include it for ATOM but not HETATM. (So it's # specialcased below, rather than being included in this table.) # (Later: can't we use the case of the 'E' to distinguish it from He?) "HN":"H", } # Create and display a Progress dialog while reading the MMP file. # One issue with this implem is that QProgressDialog always displays # a "Cancel" button, which is not hooked up. I think this is OK for now, # but later we should either hook it up or create our own progress # dialog that doesn't include a "Cancel" button. --mark 2007-12-06 if showProgressDialog: _progressValue = 0 _progressFinishValue = len(lines) win = env.mainwindow() win.progressDialog.setLabelText("Reading file...") win.progressDialog.setRange(0, _progressFinishValue) _progressDialogDisplayed = False _timerStart = time.time() for card in lines: key = card[:6].lower().replace(" ", "") if key in ["atom", "hetatm"]: ## sym = capitalize(card[12:14].replace(" ", "").replace("_", "")) # bruce 080508 revision (guess at a bugfix for reading NE1-saved # pdb files): # get a list of atomnames to try; use the first one we recognize. # Note that full atom name is in columns 13-16 i.e. card[12:16]; # see http://www.wwpdb.org/documentation/format2.3-0108-us.pdf, # page 156. The old code only looked at two characters, # card[12:14] == columns 13-14, and discarded ' ' and '_', # and capitalized (the first character only). The code as I revised # it on 070410 also discarded digits, and handled HB, HE, HN # (guesses) using the atomname_exceptions dict. name4 = card[12:16].replace(" ", "").replace("_", "") name3 = card[12:15].replace(" ", "").replace("_", "") name2 = card[12:14].replace(" ", "").replace("_", "") def nodigits(name): for bad in "0123456789": name = name.replace(bad, "") return name atomnames_to_try = [ name4, # as seems best according to documentation name3, name2, # like old code nodigits(name4), nodigits(name3), nodigits(name2) # like code as revised on 070410 ] foundit = False for atomname in atomnames_to_try: atomname = atomname_exceptions.get(atomname, atomname) if atomname == "HE" and key == "atom": atomname = "H" # see comment in atomname_exceptions sym = capitalize(atomname) # turns either 'he' or 'HE' into 'He' try: PeriodicTable.getElement(sym) except: # note: this typically fails with AssertionError # (not e.g. KeyError) [bruce 050322] continue else: foundit = True break pass if not foundit: msg = "Warning: Pdb file: will use Carbon in place of unknown element %s in: %s" \ % (name4, card) print msg #bruce 070410 added this print env.history.message( redmsg( msg )) ##e It would probably be better to create a fake atom, so the # CONECT records would still work. #bruce 080508 let's do that: sym = "C" # Better still might be to create a fake element, # so we could write out the pdb file again # (albeit missing lots of info). [bruce 070410 comment] # Note: an advisor tells us: # PDB files sometimes encode atomtypes, # using C_R instead of C, for example, to represent sp2 # carbons. # That particular case won't trigger this exception, since we # only look at 2 characters [eventually, after trying more, as of 080508], # i.e. C_ in that case. It would be better to realize this means # sp2 and set the atomtype here (and perhaps then use it when # inferring bonds, which we do later if the file doesn't have # any bonds). [bruce 060614/070410 comment] # Now the element name is in sym. xyz = map(float, [card[30:38], card[38:46], card[46:54]] ) n = int(card[6:11]) a = Atom(sym, A(xyz), mol) ndix[n] = a elif key == "conect": try: a1 = ndix[int(card[6:11])] except: #bruce 050322 added this level of try/except and its message; # see code below for at least two kinds of errors this might # catch, but we don't try to distinguish these here. BTW this # also happens as a consequence of not finding the element # symbol, above, since atoms with unknown elements are not # created. env.history.message( redmsg( "Warning: Pdb file: can't find first atom in CONECT record: %s" % (card,) )) else: for i in range(11, 70, 5): try: a2 = ndix[int(card[i:i+5])] except ValueError: # bruce 050323 comment: # we assume this is from int('') or int(' ') etc; # this is the usual way of ending this loop. break except KeyError: #bruce 050322-23 added history warning for this, # assuming it comes from ndix[] lookup. env.history.message( redmsg( "Warning: Pdb file: can't find atom %s in: %s" % (card[i:i+5], card) )) continue bond_atoms(a1, a2) numconects += 1 if showProgressDialog: # Update the progress dialog. _progressValue += 1 if _progressValue >= _progressFinishValue: win.progressDialog.setLabelText("Building model...") elif _progressDialogDisplayed: win.progressDialog.setValue(_progressValue) else: _timerDuration = time.time() - _timerStart if _timerDuration > 0.25: # Display progress dialog after 0.25 seconds win.progressDialog.setValue(_progressValue) _progressDialogDisplayed = True if showProgressDialog: # Make the progress dialog go away. win.progressDialog.setValue(_progressFinishValue) #bruce 050322 part of fix for bug 433: don't return an empty chunk if not mol.atoms: env.history.message( redmsg( "Warning: Pdb file contained no atoms")) return None if numconects == 0: msg = orangemsg("PDB file has no bond info; inferring bonds") env.history.message(msg) # let user see message right away (bond inference can take significant # time) [bruce 060620] env.history.h_update() inferBonds(mol) return mol # PDB atom types for proteins. Assumes that the remaining atom types # have default hybridizations. PROTEIN_ATOM_TYPES = { "ANY" : { "N" : "sp2(graphitic)", # these atom types are common for all amino "C" : "sp2", # acids "O" : "sp2" }, "PHE" : { "CG" : "sp2aro", # "sp2a" is an sp2 atom connected to another sp2a atom "CD1" : "sp2aro", # with an aromatic bond "CE1" : "sp2aro", "CZ" : "sp2aro", "CE2" : "sp2aro", "CD2" : "sp2aro" }, "GLU" : { "CD" : "sp2", "OE1" : "sp2" }, "GLN" : { "CD" : "sp2", "OE1" : "sp2" }, "ASP" : { "CG" : "sp2", "OD1" : "sp2" }, "ASN" : { "CG" : "sp2", "OD1" : "sp2" }, "TRP" : { "CG" : "sp2", "CD1" : "sp2", "CE2" : "sp2aro", "CZ2" : "sp2aro", "CH2" : "sp2aro", "CZ3" : "sp2aro", "CE3" : "sp2aro", "CD2" : "sp2aro" }, "TYR" : { "CG" : "sp2aro", "CD1" : "sp2aro", "CE1" : "sp2aro", "CZ" : "sp2aro", "CE2" : "sp2aro", "CD2" : "sp2aro" }, "ARG" : { "CZ" : "sp2", "NH2" : "sp2" }, "HIS" : { "CG" : "sp2", "CE1" : "sp2a", "NE2" : "sp2a", # Two "sp2s" atoms are connected with a bond "CD2" : "sp2" } } # of order 1. I introduced this temporary marker # to describe conjugated double bond systems. # If the one of the "a", "b", or "c" markers # is specified, only the atoms sharing the # same marker will be double-bonded. # " DG", "DC", "DT", and " DA" names correspond to deoxynucleotides # as defined in PDB format >= 3.0 # Older versions didn't distinguish between RNA and DNA base names # so both compounds used " G", " C", " A", " T" (and " U" in case of # RNA) NUCLEIC_ATOM_TYPES = { " DC" : { "P" : "sp3(p)", # phosphate phosphorus "OP1" : "sp2(-.5)", # and two negatively charged oxygens "OP2" : "sp2(-.5)", "C2" : "sp2a", "O2" : "sp2a", "C4" : "sp2b", "N3" : "sp2b", "C5" : "sp2a", "C6" : "sp2a" }, " DG" : { "P" : "sp3(p)", "OP1" : "sp2(-.5)", "OP2" : "sp2(-.5)", "C4" : "sp2b", "C5" : "sp2b", "C8" : "sp2a", "N7" : "sp2a", "C2" : "sp2a", "N3" : "sp2a", "C6" : "sp2c", "O6" : "sp2c" }, " DA" : { "P" : "sp3(p)", "OP1" : "sp2(-.5)", "OP2" : "sp2(-.5)", "C2" : "sp2a", "N3" : "sp2a", "C6" : "sp2b", "N1" : "sp2b", "N4" : "sp2b", "C4" : "sp2c", "C5" : "sp2c", "N7" : "sp2a", "C8" : "sp2a" }, " DT" : { "P" : "sp3(p)", "OP1" : "sp2(-.5)", "OP2" : "sp2(-.5)", "C5" : "sp2a", "C6" : "sp2a", "C4" : "sp2b", "O4" : "sp2b", "C2" : "sp2a", "O2" : "sp2a" }, " C" : { "P" : "sp3(p)", "O1P" : "sp2(-.5)", "O2P" : "sp2(-.5)", "C2" : "sp2a", "O2" : "sp2a", "C4" : "sp2b", "N3" : "sp2b", "C5" : "sp2a", "C6" : "sp2a" }, " G" : { "P" : "sp3(p)", "O1P" : "sp2(-.5)", "O2P" : "sp2(-.5)", "C4" : "sp2b", "C5" : "sp2b", "C8" : "sp2a", "N7" : "sp2a", "C2" : "sp2a", "N3" : "sp2a", "C6" : "sp2c", "O6" : "sp2c" }, " A" : { "P" : "sp3(p)", "O1P" : "sp2(-.5)", "O2P" : "sp2(-.5)", "C2" : "sp2a", "N3" : "sp2a", "C6" : "sp2b", "N1" : "sp2b", "N4" : "sp2b", "C4" : "sp2c", "C5" : "sp2c", "N7" : "sp2a", "C8" : "sp2a" }, " T" : { "P" : "sp3(p)", "O1P" : "sp2(-.5)", "O2P" : "sp2(-.5)", "C5" : "sp2a", "C6" : "sp2a", "C4" : "sp2b", "O4" : "sp2b", "C2" : "sp2a", "O2" : "sp2a" }, } # Important notes: (piotr 080909) # # - the PDB reading code can read standard Protein Data Bank files and preserve # ATOM and HETATM record information, so the molecules can be exported as PDB # and recognized by other programs # # - not all information contained in PDB files is preserved, only per-atom fields # # - only first model is read from multi-model files, # # - "Heteroatoms" group is created in Model Tree even if there are no heteroatoms # # - inferBonds method is used to rebuild bonds for proteins and DNA. This should # be replaced by bond assignment using pattern matching. Currently, only # proper bond orders are assigned for standard residues, but connectivity # is computed using the inferBonds method. # # - PAM3 and PAM5 models can be written and read using the new code, # although - as it was before - the internal DNA representation is not preserved # # - the old PDB I/O code still remains there, and will be used if ENABLE_PROTEINS # debug pref is set to False (perhaps this behavior should be controlled by # another debug pref) def _readpdb_new(assy, filename, isInsert = False, showProgressDialog = False, chainId = None): """ Read a Protein DataBank-format file into a single new chunk, which is returned unless there are no atoms in the file, in which case a warning is printed and None is returned. (The new chunk (if returned) is in assy, but is not yet added into any Group or Part in assy -- caller must do that.) Unless isInsert = True, set assy.filename to match the file we read, even if we return None. @param assy: The assembly. @type assy: L{assembly} @param filename: The PDB filename to read. @type filename: string @param isInsert: If True, the PDB file will be inserted into the current assembly. If False (default), the PDB is opened as the assembly. @param isInsert: boolean @param showProgressDialog: if True, display a progress dialog while reading a file. @type showProgressDialog: boolean @return: A chunk containing the contents of the PDB file. @rtype: L{Chunk} @see: U{B{PDB File Format}} """ # These methods should be probably moved to this file. from protein.model.Protein import is_water, is_amino_acid, is_nucleotide def _add_bondpoints(mol): """ Adds missing bondpoints to a molecule. @param mol: molecule @type mol: Chunk """ atlist = [atom for (key, atom) in mol.atoms.items()] for atom in atlist: atom.make_enough_bondpoints() def _set_atom_type(atom, atom_name, res_name): """ Assigns an atom type based on atom name and residue name by simple name pattern matching. @param atom_name: PDB name of the atom @type atom_name: string @param res_name: PDB residue name @type res_name: string """ atom.setOverlayText(atom_name) _assigned = False ### print (res_name, atom_name) # Look for the atom type and set the type if PROTEIN_ATOM_TYPES.has_key(res_name): # Found a protein residue. atom_type_dict = PROTEIN_ATOM_TYPES[res_name] if atom_type_dict.has_key(atom_name): # Found the atom. atom_type = atom_type_dict[atom_name] if atom_type == "sp2a": sp2a_atoms.append(atom) atom_type = "sp2" if atom_type == "sp2c": sp2c_atoms.append(atom) atom_type = "sp2" if atom_type == "sp2b": sp2b_atoms.append(atom) atom_type = "sp2" if atom_type == "sp2aro": aromatic_atoms.append(atom) atom_type = "sp2" atom.set_atomtype_but_dont_revise_singlets(atom_type) _assigned = True elif NUCLEIC_ATOM_TYPES.has_key(res_name): # Found a nucleic acid residue atom_type_dict = NUCLEIC_ATOM_TYPES[res_name] if atom_type_dict.has_key(atom_name): # Found the atom. atom_type = atom_type_dict[atom_name] if atom_type == "sp2a": sp2a_atoms.append(atom) atom_type = "sp2" if atom_type == "sp2b": sp2b_atoms.append(atom) atom_type = "sp2" if atom_type == "sp2c": sp2c_atoms.append(atom) atom_type = "sp2" atom.set_atomtype_but_dont_revise_singlets(atom_type) _assigned = True if not _assigned: # Look for common atom types (N, C, O) atom_type_dict = PROTEIN_ATOM_TYPES["ANY"] # For remaining residues, look at one of the standard atom types if atom_type_dict.has_key(atom_name): atom_type = atom_type_dict[atom_name] atom.set_atomtype_but_dont_revise_singlets(atom_type) def _finalize_molecule(): """ Performs some operations after reading the entire PDB chain: - rebuild (infer) bonds for protein and DNA chains - assigns proper bond orders for standard residues - renames molecule to reflect a chain ID - deletes the "protein" attribute if it is not a protein - appends the molecule to the molecule list """ if mol == water: # Skip water, to be added to the mollist explicitly at the end. return if mol.atoms: # Create a molecule name by concatenating PDB ID and chain ID mol.name = pdbid.lower() + chainId if mol.protein.count_c_alpha_atoms() == 0 and \ not dont_split: # If there are C-alpha atoms and dont_split flag is not set, # consider the chunk non-protein. # Create a heteromolecule name mol.name = resName + '[' + resId.replace(' ','') + ']' hetgroup.addchild(mol) else: # For protein - infer the bonds anyway. This should be replaced # by a proper atom / bond type assignment, for example using # the templates present in the Peptide Generator. piotr 081908 inferBonds(mol) aromatic_atom_list = [] single_bonded_atom_list = [] # This is a protein. # Assign proper atom types according to protein templates. # amino_acid_list = mol.protein.get_amino_acids() #for aa in amino_acid_list: # aromatic, single_bonded = set_protein_atom_type(atom, # Assign bond orders. piotr 081908 from model.bond_constants import V_DOUBLE, V_AROMATIC, V_GRAPHITE # Inferring bond types for atom in mol.atoms.itervalues(): if atom.bonds: for bond in atom.bonds: atom1_type = bond.atom1.getAtomTypeName() atom2_type = bond.atom2.getAtomTypeName() if (atom1_type == "sp2" and atom2_type == "sp2"): if (bond.atom1 in aromatic_atoms and bond.atom2 in aromatic_atoms): bond.set_v6(V_AROMATIC) elif ((bond.atom1 in sp2a_atoms and bond.atom2 in sp2a_atoms) or (bond.atom1 in sp2b_atoms and bond.atom2 in sp2b_atoms) or (bond.atom1 in sp2c_atoms and bond.atom2 in sp2c_atoms) or (bond.atom1 not in sp2a_atoms and bond.atom1 not in sp2b_atoms and bond.atom1 not in sp2c_atoms and bond.atom1 not in aromatic_atoms)): bond.set_v6(V_DOUBLE) # for phosphate P - charged oxygen bond: assign V_GRAPHITE if ((atom1_type == "sp3(p)" and atom2_type == "sp2(-.5)") or ((atom2_type == "sp3(p)" and atom1_type == "sp2(-.5)"))): bond.set_v6(V_GRAPHITE) if mol.protein.count_c_alpha_atoms() == 0: # It is not a protein molecule: remove the protein information. mol.protein = None else: # Set the PDB information (chain ID and PDB code) mol.protein.set_chain_id(chainId) mol.protein.set_pdb_id(pdbid) # Add the molecule to mollist mollist.append(mol) # Create bondpoints for easy hydrogenation _add_bondpoints(mol) else: env.history.message( redmsg( "Warning: PDB residue contained no atoms")) env.history.h_update() pass pass # _finalize_molecule # Read the file contents. fi = open(filename,"rU") lines = fi.readlines() fi.close() assy.part.ensure_toplevel_group() aromatic_atoms = [] sp2a_atoms = [] sp2b_atoms = [] sp2c_atoms = [] # List of molecules read from PDB file. mollist = [] # Lists of secondary structure tuples (res_id, chain_id) helix = [] sheet = [] turn = [] dir, nodename = os.path.split(filename) if not isInsert: assy.filename = filename # dictionary for HETATM connectivity reconstruction ndix = {} lastResId = None # Create a molecule chunk mol = Chunk(assy, nodename) mol.protein = Protein() dont_split = False hetgroup = Group("Heteroatoms", assy, assy.part.topnode) # Create a chunk to store water molecules. water = Chunk(assy, nodename) numconects = 0 comment_text = "" _read_rosetta_info = False comment_title = "PDB Header" # Create a temporary PDB ID - it should be later extracted from the # file header. pdbid = nodename.replace(".pdb","").lower() atomname_exceptions = { "HB":"H", #k these are all guesses -- I can't find this documented # anywhere [bruce 070410] "CA":"C", "NE":"N", "HG":"H", ## "HE":"H", ### REVIEW: I'm not sure about this one -- ### leaving it out means it's read as Helium, # but including it erroneously might prevent reading an actual Helium # if that was intended. # Guess for now: include it for ATOM but not HETATM. (So it's # specialcased below, rather than being included in this table.) # (Later: can't we use the case of the 'E' to distinguish it from He?) "HN":"H", } # Create and display a Progress dialog while reading the MMP file. # One issue with this implem is that QProgressDialog always displays # a "Cancel" button, which is not hooked up. I think this is OK for now, # but later we should either hook it up or create our own progress # dialog that doesn't include a "Cancel" button. --mark 2007-12-06 if showProgressDialog: _progressValue = 0 _progressFinishValue = len(lines) win = env.mainwindow() win.progressDialog.setLabelText("Reading file...") win.progressDialog.setRange(0, _progressFinishValue) _progressDialogDisplayed = False _timerStart = time.time() for card in lines: key = card[:6].lower().replace(" ", "") if key in ["atom", "hetatm"]: # Set _is_hetero flag for HETATM if key == "atom": _is_hetero = False else: _is_hetero = True ## sym = capitalize(card[12:14].replace(" ", "").replace("_", "")) # bruce 080508 revision (guess at a bugfix for reading NE1-saved # pdb files): # get a list of atomnames to try; use the first one we recognize. # Note that full atom name is in columns 13-16 i.e. card[12:16]; # see http://www.wwpdb.org/documentation/format2.3-0108-us.pdf, # page 156. The old code only looked at two characters, # card[12:14] == columns 13-14, and discarded ' ' and '_', # and capitalized (the first character only). The code as I revised # it on 070410 also discarded digits, and handled HB, HE, HN # (guesses) using the atomname_exceptions dict. name4 = card[12:16].replace(" ", "").replace("_", "") name3 = card[12:15].replace(" ", "").replace("_", "") name2 = card[12:14].replace(" ", "").replace("_", "") chainId = card[21] resId = card[22:26].replace(" ", "") + card[27] if lastResId == None: lastResId = resId resName = card[17:20] _is_water = is_water(resName) _is_amino_acid = is_amino_acid(resName) _is_nucleotide = is_nucleotide(resName) sym = card[77:79] # Element symbol alt = card[16] # Alternate location indicator if alt != ' ' and \ alt != 'A': # Skip non-standard alternate location # This is not very safe test, it should preserve # the remaining atoms. piotr 080715 continue ###ATOM 131 CB ARG A 18 104.359 32.924 58.573 1.00 36.93 C def nodigits(name): for bad in "0123456789": name = name.replace(bad, "") return name atomnames_to_try = [ name4, # as seems best according to documentation name3, name2, # like old code nodigits(name4), nodigits(name3), nodigits(name2) # like code as revised on 070410 ] # piotr 080819: first look at the 77-78 field - it should include # the element symbol. foundit = False try: PeriodicTable.getElement(sym) except: pass else: foundit = True # if not found, look at possible atom names if not foundit: for atomname in atomnames_to_try: atomname = atomname_exceptions.get(atomname, atomname) if atomname[0] == 'H' and key == "atom": atomname = "H" # see comment in atomname_exceptions sym = capitalize(atomname) # turns either 'he' or 'HE' into 'He' try: PeriodicTable.getElement(sym) except: # note: this typically fails with AssertionError # (not e.g. KeyError) [bruce 050322] continue else: foundit = True break pass if not foundit: msg = "Warning: Pdb file: will use Carbon in place of unknown element %s in: %s" \ % (name4, card) print msg #bruce 070410 added this print env.history.message( redmsg( msg )) ##e It would probably be better to create a fake atom, so the # CONECT records would still work. #bruce 080508 let's do that: sym = "C" # Better still might be to create a fake element, # so we could write out the pdb file again # (albeit missing lots of info). [bruce 070410 comment] # Note: an advisor tells us: # PDB files sometimes encode atomtypes, # using C_R instead of C, for example, to represent sp2 # carbons. # That particular case won't trigger this exception, since we # only look at 2 characters [eventually, after trying more, as of 080508], # i.e. C_ in that case. It would be better to realize this means # sp2 and set the atomtype here (and perhaps then use it when # inferring bonds, which we do later if the file doesn't have # any bonds). [bruce 060614/070410 comment] # Now the element name is in sym. xyz = map(float, [card[30:38], card[38:46], card[46:54]] ) n = int(card[6:11]) if resId != lastResId and \ not _is_amino_acid and \ not _is_nucleotide and \ not _is_water: # Finalize current molecule. _finalize_molecule() # Discard the original molecule and create a new one. mol = Chunk(assy, nodename) mol.protein = Protein() dont_split = False if _is_water: # If this is a water molecule, add the atom to the Water chunk a = Atom(sym, A(xyz), water) else: # Otherwise, add it to the current molecule. a = Atom(sym, A(xyz), mol) # Store PDB information in the Atom object pdb_info dict. if not a.pdb_info: # Create the pdb_info dictionary if it doesn't exist. a.pdb_info = {} # Store PDB atom properties in the pdb_info dict a.pdb_info['atom_name'] = name4 a.pdb_info['residue_id'] = resId a.pdb_info['residue_name'] = resName a.pdb_info['chain_id'] = chainId if not _is_hetero: # The 'standard_atom' key represents a bool value set to # true if this atom is "standard PDB atom", e.g. it was # read from ATOM record. a.pdb_info['standard_atom'] = True # Normally, the connectivity information is only available # for HETATM records. But other programs can write CONECT info # for ATOM records, as well. ndix[n] = a if _is_amino_acid or \ _is_nucleotide: # Don't split proteins or nucleotides into individual # residues. What about carbohydrates? piotr 081908 dont_split = True if not _is_water: # Adds the atom to the "protein" chunk. mol.protein.add_pdb_atom(a, name4, resId, resName, setType=True) if _is_amino_acid or \ _is_nucleotide: # Recognize atom type by pattern matching of the atom name. # Do this only for proteins and nucleic acids. _set_atom_type(a, name4, resName) # Assign one of three types of secondary structure. if (resId, chainId) in helix: # helix mol.protein.assign_helix(resId) if (resId, chainId) in sheet: # extended mol.protein.assign_strand(resId) if (resId, chainId) in turn: # turn mol.protein.assign_turn(resId) # Remember the most recent resId lastResId = resId elif key == "conect": try: a1 = ndix[int(card[6:11])] except: #bruce 050322 added this level of try/except and its message; # see code below for at least two kinds of errors this might # catch, but we don't try to distinguish these here. BTW this # also happens as a consequence of not finding the element # symbol, above, since atoms with unknown elements are not # created. env.history.message( redmsg( "Warning: Pdb file: can't find first atom in CONECT record: %s" % (card,) )) else: for i in range(11, 70, 5): try: a2 = ndix[int(card[i:i+5])] except ValueError: # bruce 050323 comment: # we assume this is from int('') or int(' ') etc; # this is the usual way of ending this loop. break except KeyError: #bruce 050322-23 added history warning for this, # assuming it comes from ndix[] lookup. env.history.message( redmsg( "Warning: Pdb file: can't find atom %s in: %s" % (card[i:i+5], card) )) continue bond_atoms(a1, a2) numconects += 1 elif key == "ter": # Finalize current molecule. _finalize_molecule() # Discard the original molecule and create a new one. mol = Chunk(assy, nodename) mol.protein = Protein() dont_split = False ### numconects = 0 # HEADER, COMPND and REMARK fields are added to the comment text elif key == "header": # Extract PDB ID from the header string. pdbid = card[62:66].lower() comment_text += card elif key == "compnd": comment_text += card elif key == "remark": comment_text += card elif key == "model": # Check out the MODEL record, ignore everything other than MODEL 1. # This behavior should be optional and set via User Preference. # piotr 080714 model_id = int(card[6:20]) if model_id > 1: env.history.message( redmsg( "Warning: multi-model file; skipping remaining models.")) env.history.h_update() # Skip remaining part of the file. break elif key in ["helix", "sheet", "turn"]: # Read secondary structure information. if key == "helix": begin = int(card[22:25]) end = int(card[34:37]) chainId = card[19] for s in range(begin, end+1): helix.append((s, chainId)) elif key == "sheet": begin = int(card[23:26]) end = int(card[34:37]) chainId = card[21] for s in range(begin, end+1): sheet.append((s, chainId)) elif key == "turn": begin = int(card[23:26]) end = int(card[34:37]) chainId = card[19] for s in range(begin, end+1): turn.append((s, chainId)) else: # Rosetta-written PDB files include scoring information. if card[7:15] == "ntrials:": _read_rosetta_info = True comment_text += "Rosetta Scoring Analysis\n" if card[9:15] == "score:": score = float(card[16:]) comment_title = "Rosetta Score: %g" % score if _read_rosetta_info: comment_text += card if showProgressDialog: # Update the progress dialog. _progressValue += 1 if _progressValue >= _progressFinishValue: win.progressDialog.setLabelText("Building model...") elif _progressDialogDisplayed: win.progressDialog.setValue(_progressValue) else: _timerDuration = time.time() - _timerStart if _timerDuration > 0.25: # Display progress dialog after 0.25 seconds win.progressDialog.setValue(_progressValue) _progressDialogDisplayed = True if showProgressDialog: # Make the progress dialog go away. win.progressDialog.setValue(_progressFinishValue) _finalize_molecule() mollist.append(hetgroup) if water.atoms: # Rebuild bonds in case there are hydrogens present. inferBonds(water) # Add bondpoints. _add_bondpoints(water) # Check if there are any water molecules water.name = "Solvent" # The water should be hidden by default. water.hide() mollist.append(water) return (mollist, comment_text, comment_title) # read oa Protein DataBank-format file or insert it into a single Chunk # piotr 080715 refactored "read" and "insert" code so they both call # this method instead having a redundant code (the only difference is # "isInsert" parameter. #bruce 050322 revised this for bug 433 def read_or_insert_pdb(assy, filename, showProgressDialog = False, chainId = None, isInsert = False): """ Reads (loads) a PDB file, or inserts it into an existing chunk. @param assy: The assembly. @type assy: L{assembly} @param filename: The PDB filename to read. @type filename: string @param showProgressDialog: if True, display a progress dialog while reading a file. Default is False. @type showProgressDialog: boolean """ from utilities.GlobalPreferences import ENABLE_PROTEINS if ENABLE_PROTEINS: molecules, comment_text, comment_title = _readpdb_new(assy, filename, isInsert = isInsert, showProgressDialog = showProgressDialog, chainId = chainId) if molecules: assy.part.ensure_toplevel_group() dir, name = os.path.split(filename) #name = gensym(nodename, assy) group = Group(name, assy, assy.part.topnode) for mol in molecules: if mol is not None: group.addchild(mol) comment = Comment(assy, comment_title, comment_text) group.addchild(comment) assy.addnode(group) else: mol = _readpdb(assy, filename, isInsert = isInsert, showProgressDialog = showProgressDialog, chainId = chainId) if mol is not None: assy.addmol(mol) return # read a Protein DataBank-format file into a single Chunk #bruce 050322 revised this for bug 433 def readpdb(assy, filename, showProgressDialog = False, chainId = None): """ Reads (loads) a PDB file. @param assy: The assembly. @type assy: L{assembly} @param filename: The PDB filename to read. @type filename: string @param showProgressDialog: if True, display a progress dialog while reading a file. Default is False. @type showProgressDialog: boolean """ read_or_insert_pdb(assy, filename, showProgressDialog = showProgressDialog, chainId = chainId, isInsert = False) # Insert a Protein DataBank-format file into a single Chunk #bruce 050322 revised this for bug 433 def insertpdb(assy, filename, chainId = None): """ Reads a pdb file and inserts it into the existing model. @param assy: The assembly. @type assy: L{assembly} @param filename: The PDB filename to read. @type filename: string """ read_or_insert_pdb(assy, filename, showProgressDialog = True, chainId = chainId, isInsert = True) # Write a PDB ATOM record record. # Copied and modified version of chem.py Atom.writepdb method. # Should be moved back to Atom class as soon as we decided that the # new implementation is satisfactory. # piotr 080710 def writepdb_atom(atom, file, atomSerialNumber, atomName, chainId, resId, \ resName, hetatm, occup, temp): """ Write a PDB ATOM record for the atom into I{file}. @param atom: The atom. @type atom: Atom @param file: The PDB file to write the ATOM record to. @type file: file @param atomSerialNumber: A unique number for this atom. @type atomSerialNumber: int @param chainId: The chain id. It is a single character. See the PDB documentation for the ATOM record more information. @type chainId: str @note: If you edit the ATOM record, be sure to to test QuteMolX. @see: U{B{ATOM Record Format}} """ space = " " # Begin ATOM record ---------------------------------- # Column 1-6: "ATOM " (str) or "HETATM" depending on a type of atom if hetatm: atomRecord = "HETATM" else: atomRecord = "ATOM " # Column 7-11: Atom serial number (int) atomRecord += "%5d" % atomSerialNumber # Column 12: Whitespace (str) atomRecord += "%1s" % space # Column 13-16 Atom name (str) # piotr 080710: moved Atom name to column 13 if len(atomName) == 4: atomRecord += "%-4s" % atomName[:4] else: atomRecord += " %-3s" % atomName[:3] # Column 17: Alternate location indicator (str) *unused* atomRecord += "%1s" % space # Column 18-20: Residue name - unused (str) atomRecord += "%3s" % resName # Column 21: Whitespace (str) atomRecord += "%1s" % space # Column 22: Chain identifier - single letter (str) # This has been tested with 35 chunks and still works in QuteMolX. atomRecord += "%1s" % chainId.upper() # Column 23-27: Residue sequence number (int) + Code for insertion of residues (AChar) atomRecord += "%5s" % resId # Column 28-30: Whitespace (str) atomRecord += "%3s" % space # Get atom XYZ coordinate _xyz = atom.posn() # Column 31-38: X coord in Angstroms (float 8.3) atomRecord += "%8.3f" % float(_xyz[0]) # Column 39-46: Y coord in Angstroms (float 8.3) atomRecord += "%8.3f" % float(_xyz[1]) # Column 47-54: Z coord in Angstroms (float 8.3) atomRecord += "%8.3f" % float(_xyz[2]) # Column 55-60: Occupancy (float 6.2) - should be "1.0" atomRecord += "%6.2f" % occup # Column 61-66: Temperature factor. (float 6.2) *unused* atomRecord += "%6.2f" % temp # Column 67-76: Whitespace (str) atomRecord += "%10s" % space # Column 77-78: Element symbol, right-justified (str) atomRecord += "%2s" % atom.element.symbol[:2] # Column 79-80: Charge on the atom (str) *unused* atomRecord += "%2s\n" % space # End ATOM record ---------------------------------- file.write(atomRecord) return # Write all Chunks into a Protein DataBank-format file # [bruce 050318 revised comments, and made it not write singlets or their bonds, # and made it not write useless 1-atom CONECT records, and include each bond # in just one CONECT record instead of two.] # PDB exclude flags, used by writepdb() and its callers. # Ask Bruce what "constants" file these should be moved to (if any). # Mark 2007-06-11 WRITE_ALL_ATOMS = 0 EXCLUDE_BONDPOINTS = 1 EXCLUDE_HIDDEN_ATOMS = 2 # excludes both hidden and invisible atoms. EXCLUDE_DNA_ATOMS = 4 EXCLUDE_DNA_AXIS_ATOMS = 8 EXCLUDE_DNA_AXIS_BONDS = 16 def writepdb(part, filename, mode = 'w', excludeFlags = EXCLUDE_BONDPOINTS | EXCLUDE_HIDDEN_ATOMS, singleChunk = None): """ Write a PDB file of the I{part}. @param part: The part. @type part: assembly @param filename: The fullpath of the PDB file to write. We don't care if it has the .pdb extension or not. @type filename: string @param mode: 'w' for writing (the default) 'a' for appending @type mode: string @param excludeFlags: used to exclude certain atoms from being written, where: WRITE_ALL_ATOMS = 0 (even writes hidden and invisble atoms) EXCLUDE_BONDPOINTS = 1 (excludes bondpoints) EXCLUDE_HIDDEN_ATOMS = 2 (excludes invisible atoms) EXCLUDE_DNA_ATOMS = 4 (excludes PAM3 and PAM5 pseudo atoms) EXCLUDE_DNA_AXIS_ATOMS = 8 (excludes PAM3 axis atoms) EXCLUDE_DNA_AXIS_BONDS = 16 (suppresses PAM3 axis bonds) @type excludeFlags: integer @note: Atoms and bonds of hidden chunks are never written. @see: U{B{PDB File Format}} """ if mode != 'a': # Precaution. Mark 2007-06-25 mode = 'w' f = open(filename, mode) # doesn't yet detect errors in opening file [bruce 050927 comment] # Atom object's key is the key, the atomSerialNumber is the value atomsTable = {} # Each element of connectLists is a list of atoms to be connected with the # 1st atom in the list, i.e. the atoms to write into a CONECT record connectLists = [] atomSerialNumber = 1 from utilities.GlobalPreferences import ENABLE_PROTEINS def exclude(atm): #bruce 050318 """ Exclude this atom (and bonds to it) from the file under the following conditions (as selected by excludeFlags): - if it is a singlet - if it is not visible - if it is a member of a hidden chunk - some dna-related conditions (see code for details) """ # Added not visible and hidden member of chunk. This effectively deletes # these atoms, which might be considered a bug. # Suggested solutions: # - if the current file is a PDB and has hidden atoms/chunks, warn user # before quitting NE1 (suggest saving as MMP). # - do not support native PDB. Open PDBs as MMPs; only allow export of # PDB. # Fixes bug 2329. Mark 070423 if excludeFlags & EXCLUDE_BONDPOINTS: if atm.element == Singlet: return True # Exclude if excludeFlags & EXCLUDE_HIDDEN_ATOMS: if not atm.visible(): return True # Exclude if excludeFlags & EXCLUDE_DNA_AXIS_ATOMS: ## if atm.element.symbol in ('Ax3', 'Ae3'): #bruce 080320 bugfix: revise to cover new elements and PAM5. if atm.element.role == 'axis': return True # Exclude if excludeFlags & EXCLUDE_DNA_ATOMS: # PAM5 atoms begin at 200. # # REVIEW: better to check atom.element.pam? # What about "carbon nanotube pseudoatoms"? # [bruce 080320 question] if atm.element.eltnum >= 200: return True # Exclude # Always exclude singlets connected to DNA p-atoms. if atm.element == Singlet: for a in atm.neighbors(): if a.element.eltnum >= 200: # REVIEW: see above comment about atom.element.pam vs >= 200 return True return False # Don't exclude. excluded = 0 molnum = 1 chainIdChar = 65 # ASCII "A" if mode == 'w': writePDB_Header(f) # get a list of chunks in model tree order mollist = part.nodes_in_mmpfile_order(nodeclass = Chunk) for mol in mollist: if singleChunk: mol = singleChunk if mol.hidden: # Atoms and bonds of hidden chunks are never written. continue # write atoms in proper order ordered_atoms = mol.atoms_in_mmp_file_order() for a in ordered_atoms: if exclude(a): excluded += 1 continue atomConnectList = [] atomsTable[a.key] = atomSerialNumber hetatm = True if ENABLE_PROTEINS: # piotr 080709 : Use more robust ATOM output code for Proteins. resId = " 1 " resName = " " atomName = a.element.symbol hetatm = True occup = 1.0 temp = 0.0 if a.pdb_info: if a.pdb_info.has_key('residue_id'): resId = a.pdb_info['residue_id'] if a.pdb_info.has_key('residue_name'): resName = a.pdb_info['residue_name'] if a.pdb_info.has_key('atom_name'): atomName = a.pdb_info['atom_name'] if a.pdb_info.has_key('standard_atom'): hetatm = False if a.pdb_info.has_key('chain_id'): chainIdChar = ord(a.pdb_info['chain_id'][0]) if a.pdb_info.has_key('occupancy'): occup = a.pdb_info['occupancy'] if a.pdb_info.has_key('temperature_factor'): temp = a.pdb_info['temperature_factor'] writepdb_atom(a, f, atomSerialNumber, atomName, chr(chainIdChar), resId, resName, hetatm, occup, temp) else: a.writepdb(f, atomSerialNumber, chr(chainIdChar)) if hetatm: atomConnectList.append(a) for b in a.bonds: a2 = b.other(a) # The following removes bonds b/w PAM3 axis atoms. if excludeFlags & EXCLUDE_DNA_AXIS_BONDS: ## if a.element.symbol in ('Ax3', 'Ae3'): ## if a2.element.symbol in ('Ax3', 'Ae3'): ## continue #bruce 080320 bugfix: revise to cover new elements and PAM5. if a.element.role == 'axis' and a2.element.role == 'axis': continue if a2.key in atomsTable: assert not exclude(a2) # see comment below atomConnectList.append(a2) #bruce 050318 comment: the old code wrote every bond twice # (once from each end). I doubt we want that, so now I only # write them from the 2nd-seen end. (This also serves to # not write bonds to excluded atoms, without needing to check # that directly. The assert verifies this claim.) if len(atomConnectList) > 1: connectLists.append(atomConnectList) # bruce 050318 comment: shouldn't we leave it out if # len(atomConnectList) == 1? # I think so, so I'm doing that (unlike the previous code). atomSerialNumber += 1 # Write the chain TER-minator record # # COLUMNS DATA TYPE FIELD DEFINITION # ------------------------------------------------------ # 1 - 6 Record name "TER " # 7 - 11 Integer serial Serial number. # 18 - 20 Residue name resName Residue name. # 22 Character chainID Chain identifier. # 23 - 26 Integer resSeq Residue sequence number. # 27 AChar iCode Insertion code. # piotr 080908 note: This is not exactly correct, the TER record # shouldn't be saved between consecutive non-standard residues # (HETATM records), e.g. water molecules shouldn't be separated # by TER records. f.write("TER %5d %1s\n" % (molnum, chr(chainIdChar))) molnum += 1 chainIdChar += 1 if chainIdChar > 126: # ASCII "~", end of PDB-acceptable chain chars chainIdChar = 32 # Rollover to ASCII " " if singleChunk: break for atomConnectList in connectLists: # Begin CONECT record ---------------------------------- f.write("CONECT") for a in atomConnectList: index = atomsTable[a.key] f.write("%5d" % index) f.write("\n") # End CONECT record ---------------------------------- connectLists = [] f.write("END\n") f.close() if excluded: msg = "Warning: excluded %d open bond(s) from saved PDB file; " \ % excluded msg += "consider Hydrogenating and resaving." msg = fix_plurals(msg) env.history.message( orangemsg(msg)) return # from writepdb def writePDB_Header(fileHandle): """ Writes an informative REMARK 5 header at the top of the PDB file with the given fileHandle. @param fileHandle: The file to write into. @type fileHandle: file """ version = Version() fileHandle.write("REMARK 5\n") fileHandle.write("REMARK 5 Created %s" % (datetime.utcnow().strftime("%Y/%m/%d %I:%M:%S %p UTC\n"))) fileHandle.write("REMARK 5 with NanoEngineer-1 version %s nanoengineer-1.com\n" % repr(version)) fileHandle.write("REMARK 5") fileHandle.write(""" REMARK 6 REMARK 6 This file generally complies with the PDB format version 2.3 REMARK 6 Notes: REMARK 6 REMARK 6 - Sets of atoms are treated as PDB chains terminated with TER REMARK 6 records. Since the number of atom sets can exceed the 95 possible REMARK 6 unique chains expressed in ATOM records by single non-control ASCII REMARK 6 characters, TER record serial numbers will be used to uniquely REMARK 6 identify atom sets/chains. Chain identifiers in ATOM records will REMARK 6 "roll-over" after every 95 chains. REMARK 6\n""")