/// <summary>
/// Reads PDB records line by line from the reader's input and stores the result in
/// <paramref name="oFile"/>.
/// </summary>
/// <remarks>
/// <para>
/// Once a SEQRES record has been seen the content is treated as a protein structure:
/// ATOM records are added to a <see cref="PDBPolymer"/> grouped into monomers and
/// strands, HETATM records are added to the polymer without monomer or strand.
/// Without a SEQRES record all atoms are collected in a plain atom container.
/// Monomer names are the concatenation of the residue name, the running chain
/// letter (advanced by every TER record) and the residue sequence number.
/// </para>
/// <para>
/// Every MODEL record that follows a non-empty model starts a new chem model.
/// REMARK text is appended to the <c>Comment</c> property and COMPND text is stored
/// as the <c>Title</c> property of <paramref name="oFile"/>. HELIX, SHEET and TURN
/// records are attached to the current polymer as <see cref="PDBStructure"/>s.
/// CONECT records are only evaluated when the <c>readConnect</c> setting is enabled.
/// </para>
/// <para>
/// An <see cref="IOException"/> or <see cref="ArgumentException"/> raised while
/// reading is logged and ends the reading; whatever was read up to that point is
/// still returned. Any other exception propagates to the caller.
/// </para>
/// </remarks>
/// <param name="oFile">The chem file that receives the chem sequence which was read.</param>
/// <returns>The same <paramref name="oFile"/> instance, filled from the PDB input.</returns>
private IChemFile ReadChemFile(IChemFile oFile)
{
    // Width of the record-name field that starts every PDB line.
    const int RecordNameLength = 6;

    // initialize all containers
    var oSeq = oFile.Builder.NewChemSequence();
    var oModel = oFile.Builder.NewChemModel();
    var oSet = oFile.Builder.NewAtomContainerSet();

    // some variables needed
    var oBP = new PDBPolymer();
    var molecularStructure = oFile.Builder.NewAtomContainer();
    string cRead = "";
    char chain = 'A'; // To ensure stringent name giving of monomers
    bool isProteinStructure = false;

    atomNumberMap = new Dictionary<int, IAtom>();
    if (readConnect.IsSet)
    {
        bondsFromConnectRecords = new List<IBond>();
    }

    // do the reading of the Input
    try
    {
        while (true)
        {
            cRead = oInput.ReadLine();
            Debug.WriteLine($"Read line: {cRead}");
            if (cRead == null)
            {
                break;
            }

            // ReadAtom receives the length of the line as it was read, before padding.
            int lineLength = cRead.Length;

            // Make sure the record name is 6 characters long; short lines such as a
            // bare "END" or "TER" would otherwise make Substring(0, 6) throw.
            if (lineLength < RecordNameLength)
            {
                cRead = cRead.PadRight(RecordNameLength);
            }

            // check the first column to decide what to do
            var cCol = cRead.Substring(0, RecordNameLength);
            switch (cCol.ToUpperInvariant())
            {
                case "SEQRES":
                    {
                        isProteinStructure = true;
                    }
                    break;

                case "ATOM  ":
                    {
                        // read an atom record
                        var oAtom = ReadAtom(cRead, lineLength);

                        if (isProteinStructure)
                        {
                            var chainName = chain.ToString(NumberFormatInfo.InvariantInfo);

                            // construct a string describing the residue:
                            // residue name + running chain letter + sequence number
                            var cResidue = new StringBuilder(8);
                            var resName = oAtom.ResName;
                            if (resName != null)
                            {
                                cResidue.Append(resName.Trim());
                            }
                            if (oAtom.ChainID != null)
                            {
                                // The running chain letter is used, not the chain ID of the record.
                                cResidue.Append(chain);
                            }
                            var resSeq = oAtom.ResSeq;
                            if (resSeq != null)
                            {
                                cResidue.Append(resSeq.Trim());
                            }
                            var monomerName = cResidue.ToString();

                            // search for an existing strand or create a new one.
                            var strandName = oAtom.ChainID;
                            if (string.IsNullOrEmpty(strandName))
                            {
                                strandName = chainName;
                            }
                            var oStrand = oBP.GetStrand(strandName);
                            if (oStrand == null)
                            {
                                oStrand = new PDBStrand
                                {
                                    StrandName = strandName,
                                    Id = chainName
                                };
                            }

                            // search for an existing monomer or create a new one.
                            // NOTE(review): the lookup uses the running chain letter while the
                            // strand is named after the record's chain ID - confirm this is intended.
                            var oMonomer = oBP.GetMonomer(monomerName, chainName);
                            if (oMonomer == null)
                            {
                                oMonomer = new PDBMonomer
                                {
                                    MonomerName = monomerName,
                                    MonomerType = oAtom.ResName,
                                    ChainID = oAtom.ChainID,
                                    ICode = oAtom.ICode,
                                    ResSeq = oAtom.ResSeq
                                };
                            }

                            // add the atom
                            oBP.AddAtom(oAtom, oMonomer, oStrand);
                        }
                        else
                        {
                            molecularStructure.Atoms.Add(oAtom);
                        }

                        // The serial number map is only needed to resolve CONECT records.
                        if (readConnect.IsSet)
                        {
                            var isDup = atomNumberMap.ContainsKey(oAtom.Serial.Value);
                            atomNumberMap[oAtom.Serial.Value] = oAtom;
                            if (isDup)
                            {
                                Trace.TraceWarning($"Duplicate serial ID found for atom: {oAtom}");
                            }
                        }
                        Debug.WriteLine($"Added ATOM: {oAtom}");
                    }
                    break;

                case "HETATM":
                    {
                        // As HETATMs cannot be considered to either belong to a certain
                        // monomer or strand, they are dealt with separately.
                        var oAtom = ReadAtom(cRead, lineLength);
                        oAtom.HetAtom = true;
                        if (isProteinStructure)
                        {
                            oBP.Atoms.Add(oAtom);
                        }
                        else
                        {
                            molecularStructure.Atoms.Add(oAtom);
                        }

                        var isDup = atomNumberMap.ContainsKey(oAtom.Serial.Value);
                        atomNumberMap[oAtom.Serial.Value] = oAtom;
                        if (isDup)
                        {
                            Trace.TraceWarning($"Duplicate serial ID found for atom: {oAtom}");
                        }
                        Debug.WriteLine($"Added HETATM: {oAtom}");
                    }
                    break;

                case "TER   ":
                    {
                        // Start a new strand: only the running chain letter advances here.
                        // The strand object itself is created by the next ATOM record.
                        chain++;
                        Debug.WriteLine("Added new STRAND");
                    }
                    break;

                case "END   ":
                    {
                        atomNumberMap.Clear();
                        if (isProteinStructure)
                        {
                            // create bonds and finish the molecule
                            oSet.Add(oBP);
                            if (useRebondTool.IsSet)
                            {
                                try
                                {
                                    if (!CreateBondsWithRebondTool(oBP))
                                    {
                                        // Get rid of all potentially created bonds.
                                        Trace.TraceInformation("Bonds could not be created using the RebondTool when PDB file was read.");
                                        oBP.Bonds.Clear();
                                    }
                                }
                                catch (Exception exception)
                                {
                                    Trace.TraceInformation("Bonds could not be created when PDB file was read.");
                                    Debug.WriteLine(exception);
                                }
                            }
                        }
                        else
                        {
                            if (useRebondTool.IsSet)
                            {
                                CreateBondsWithRebondTool(molecularStructure);
                            }
                            oSet.Add(molecularStructure);
                        }
                    }
                    break;

                case "MODEL ":
                    {
                        // OK, start a new model and save the current one first *if* it contains atoms
                        if (isProteinStructure)
                        {
                            if (oBP.Atoms.Count > 0)
                            {
                                // save the model
                                oSet.Add(oBP);
                                oModel.MoleculeSet = oSet;
                                oSeq.Add(oModel);
                                // setup a new one
                                oBP = new PDBPolymer();
                                oModel = oFile.Builder.NewChemModel();
                                oSet = oFile.Builder.NewAtomContainerSet();
                                // avoid duplicate atom warnings
                                atomNumberMap.Clear();
                            }
                        }
                        else
                        {
                            if (molecularStructure.Atoms.Count > 0)
                            {
                                // save the model
                                oSet.Add(molecularStructure);
                                oModel.MoleculeSet = oSet;
                                oSeq.Add(oModel);
                                // setup a new one
                                molecularStructure = oFile.Builder.NewAtomContainer();
                                oModel = oFile.Builder.NewChemModel();
                                oSet = oFile.Builder.NewAtomContainerSet();
                            }
                        }
                    }
                    break;

                case "REMARK":
                    {
                        var comment = oFile.GetProperty<string>(CDKPropertyName.Comment, "");
                        if (lineLength > 12)
                        {
                            comment = comment + cRead.Substring(11).Trim() + "\n";
                            oFile.SetProperty(CDKPropertyName.Comment, comment);
                        }
                        else
                        {
                            Trace.TraceWarning("REMARK line found without any comment!");
                        }
                    }
                    break;

                case "COMPND":
                    {
                        // A COMPND line that ends before the text field must not abort
                        // the whole read through an out-of-range Substring call.
                        if (cRead.Length >= 10)
                        {
                            var title = cRead.Substring(10).Trim();
                            oFile.SetProperty(CDKPropertyName.Title, title);
                        }
                        else
                        {
                            Trace.TraceWarning("COMPND line found without any compound description!");
                        }
                    }
                    break;

                case "CONECT":
                    {
                        // Read connectivity information from CONECT records. Only
                        // covalent bonds are dealt with. Perhaps salt bridges
                        // should be dealt with in the same way..?
                        if (!readConnect.IsSet)
                        {
                            break;
                        }

                        cRead = cRead.Trim();
                        if (cRead.Length < 16)
                        {
                            Debug.WriteLine($"Skipping unexpected empty CONECT line! : {cRead}");
                        }
                        else
                        {
                            // Serial numbers are 5-character fields following the record name;
                            // the first one is the bond origin, each further one a bond partner.
                            int atomFromNumber = -1;
                            var molecule = (isProteinStructure) ? oBP : molecularStructure;
                            for (int lineIndex = 6; lineIndex + 5 <= cRead.Length; lineIndex += 5)
                            {
                                var part = cRead.Substring(lineIndex, 5).Trim();
                                int parsed;
                                bool isNumber = int.TryParse(part, NumberStyles.Integer, NumberFormatInfo.InvariantInfo, out parsed);

                                if (atomFromNumber == -1)
                                {
                                    // Still looking for the origin; unparsable fields are skipped.
                                    if (isNumber)
                                    {
                                        atomFromNumber = parsed;
                                    }
                                }
                                else
                                {
                                    int atomToNumber = isNumber ? parsed : -1;
                                    if (atomFromNumber != -1 && atomToNumber != -1)
                                    {
                                        AddBond(molecule, atomFromNumber, atomToNumber);
                                        Debug.WriteLine($"Bonded {atomFromNumber} with {atomToNumber}");
                                    }
                                }
                            }
                        }
                    }
                    break;

                case "HELIX ":
                    {
                        // Zero-based positions used below: start chain ID at 19, start
                        // sequence number at 21-24, start insertion code at 25, end chain
                        // ID at 31, end sequence number at 33-36, end insertion code at 37.
                        var structure = new PDBStructure
                        {
                            StructureType = PDBStructure.Helix,
                            StartChainID = cRead[19],
                            StartSequenceNumber = int.Parse(cRead.Substring(21, 4).Trim(), NumberFormatInfo.InvariantInfo),
                            StartInsertionCode = cRead[25],
                            EndChainID = cRead[31],
                            EndSequenceNumber = int.Parse(cRead.Substring(33, 4).Trim(), NumberFormatInfo.InvariantInfo),
                            EndInsertionCode = cRead[37]
                        };
                        oBP.Add(structure);
                    }
                    break;

                case "SHEET ":
                    {
                        var structure = new PDBStructure
                        {
                            StructureType = PDBStructure.Sheet,
                            StartChainID = cRead[21],
                            StartSequenceNumber = int.Parse(cRead.Substring(22, 4).Trim(), NumberFormatInfo.InvariantInfo),
                            StartInsertionCode = cRead[26],
                            EndChainID = cRead[32],
                            EndSequenceNumber = int.Parse(cRead.Substring(33, 4).Trim(), NumberFormatInfo.InvariantInfo),
                            EndInsertionCode = cRead[37]
                        };
                        oBP.Add(structure);
                    }
                    break;

                case "TURN  ":
                    {
                        var structure = new PDBStructure
                        {
                            StructureType = PDBStructure.Turn,
                            StartChainID = cRead[19],
                            StartSequenceNumber = int.Parse(cRead.Substring(20, 4).Trim(), NumberFormatInfo.InvariantInfo),
                            StartInsertionCode = cRead[24],
                            EndChainID = cRead[30],
                            EndSequenceNumber = int.Parse(cRead.Substring(31, 4).Trim(), NumberFormatInfo.InvariantInfo),
                            EndInsertionCode = cRead[35]
                        };
                        oBP.Add(structure);
                    }
                    break;

                default:
                    break; // ignore all other commands
            }
        }
    }
    catch (Exception e) when (e is IOException || e is ArgumentException)
    {
        // Best effort: report the offending line and keep what was read so far.
        Trace.TraceError("Found a problem at line:");
        Trace.TraceError(cRead);
        Trace.TraceError("01234567890123456789012345678901234567890123456789012345678901234567890123456789");
        Trace.TraceError(" 1 2 3 4 5 6 7 ");
        Trace.TraceError($" error: {e.Message}");
        Debug.WriteLine(e);
        Console.Error.WriteLine(e.StackTrace);
    }

    // try to close the Input
    try
    {
        oInput.Close();
    }
    catch (Exception e)
    {
        Debug.WriteLine(e);
    }

    // Set all the dependencies
    oModel.MoleculeSet = oSet;
    oSeq.Add(oModel);
    oFile.Add(oSeq);

    return oFile;
}