/// <summary>
/// Look up the sequences of one chain in the sequence table.
/// The row is searched by PdbID and AsymID first; when nothing matches, the search
/// is repeated by PdbID and AuthorChain.
/// </summary>
/// <param name="alignInfo">supplies the pdbId, asymChainId and chainId used in the row filters</param>
/// <param name="seqTable">table with the columns PdbID, AsymID, AuthorChain, SequenceInCoord and Sequence</param>
/// <returns>
/// a two-element array: [0] is the SequenceInCoord value and [1] is the Sequence value
/// of the first matching row; null when neither filter matches any row
/// </returns>
private string[] GetChainSequences(AlignSeqInfo alignInfo, DataTable seqTable)
{
    string asymFilter = string.Format("PdbID = '{0}' AND AsymID = '{1}'", alignInfo.pdbId, alignInfo.asymChainId);
    DataRow[] matchedRows = seqTable.Select(asymFilter);

    if (matchedRows.Length == 0)
    {
        // no row for the asymmetric chain id, fall back to the author chain id
        string authorFilter = string.Format("PdbID = '{0}' AND AuthorChain = '{1}'", alignInfo.pdbId, alignInfo.chainId);
        matchedRows = seqTable.Select(authorFilter);
    }

    if (matchedRows.Length == 0)
    {
        return null;
    }

    // only the first matching row is used
    DataRow firstRow = matchedRows[0];
    string coordSequence = firstRow["SequenceInCoord"].ToString();
    string fullSequence = firstRow["Sequence"].ToString();
    return new string[] { coordSequence, fullSequence };
}
/// <summary>
/// Add residues with no coordinates or no C-alpha to the alignment.
/// The sequences of both chains are read from the sequence table; each aligned sequence
/// that reports missing residues is then filled through FillMissingResidues, which also
/// pads the partner sequence.
/// </summary>
/// <param name="alignInfo1">alignment info of the first chain; alignStart, alignEnd and alignSequence may be updated</param>
/// <param name="alignInfo2">alignment info of the second chain; alignStart, alignEnd and alignSequence may be updated</param>
/// <exception cref="InvalidOperationException">
/// a chain that has to be inspected has no row in the sequence table, or its aligned
/// sequence cannot be mapped onto its coordinate sequence
/// </exception>
public void AddDisorderResiduesToAlignment(ref AlignSeqInfo alignInfo1, ref AlignSeqInfo alignInfo2)
{
    List<string> pdbList = new List<string>();
    pdbList.Add(alignInfo1.pdbId);
    if (!pdbList.Contains(alignInfo2.pdbId))
    {
        pdbList.Add(alignInfo2.pdbId);
    }
    DataTable seqTable = GetSequenceTable(pdbList, asuSeqInfoTable);

    // GetChainSequences returns null when the chain is not in the table.
    // [0] is the SequenceInCoord column, [1] is the Sequence column.
    string[] chainSequences1 = GetChainSequences(alignInfo1, seqTable);
    string[] chainSequences2 = GetChainSequences(alignInfo2, seqTable);

    // The first chain is always inspected, so a missing record is reported immediately
    // instead of surfacing as a NullReferenceException.
    if (chainSequences1 == null)
    {
        throw new InvalidOperationException("No sequence record found for chain " + alignInfo1.pdbId + alignInfo1.asymChainId);
    }

    // no disorder residues in the middle of either chain: nothing to fill.
    // The second chain is only inspected (and therefore only required) when the first
    // chain has no disorder residues.
    if (!IsSequenceWithDisorderResidues(chainSequences1[0]))
    {
        if (chainSequences2 == null)
        {
            throw new InvalidOperationException("No sequence record found for chain " + alignInfo2.pdbId + alignInfo2.asymChainId);
        }
        if (!IsSequenceWithDisorderResidues(chainSequences2[0]))
        {
            return;
        }
    }

    // Exceptions are deliberately not caught and re-thrown here: "throw ex" would only
    // reset the stack trace without adding any handling.
    if (HasMissingResidues(alignInfo1.alignSequence, alignInfo1.alignStart, alignInfo1.alignEnd))
    {
        int[] alignXmlSeqIndexes1 = GetXmlSeqIndexes(ref alignInfo1, chainSequences1[0]);
        if (alignXmlSeqIndexes1 == null)
        {
            // GetXmlSeqIndexes returns null when it finds no indexes
            throw new InvalidOperationException("Cannot map the aligned sequence to the coordinate sequence for chain " + alignInfo1.pdbId + alignInfo1.asymChainId);
        }
        FillMissingResidues(ref alignInfo1, alignXmlSeqIndexes1, chainSequences1[1], ref alignInfo2);
    }

    if (HasMissingResidues(alignInfo2.alignSequence, alignInfo2.alignStart, alignInfo2.alignEnd))
    {
        if (chainSequences2 == null)
        {
            throw new InvalidOperationException("No sequence record found for chain " + alignInfo2.pdbId + alignInfo2.asymChainId);
        }
        int[] alignXmlSeqIndexes2 = GetXmlSeqIndexes(ref alignInfo2, chainSequences2[0]);
        if (alignXmlSeqIndexes2 == null)
        {
            throw new InvalidOperationException("Cannot map the aligned sequence to the coordinate sequence for chain " + alignInfo2.pdbId + alignInfo2.asymChainId);
        }
        FillMissingResidues(ref alignInfo2, alignXmlSeqIndexes2, chainSequences2[1], ref alignInfo1);
    }
}
/// <summary>
/// Map the aligned residues of a chain to indexes in its coordinate sequence and
/// reset the alignment start and end from the first and last index.
/// </summary>
/// <param name="alignInfo">
/// alignment info whose alignSequence is mapped; alignStart and alignEnd are overwritten
/// (1-based) when at least one index is found, and left untouched otherwise
/// </param>
/// <param name="coordSequence">sequence the gap-free aligned sequence is matched against</param>
/// <returns>the indexes returned by GetXmlIndexes, or null when that array is empty</returns>
private int[] GetXmlSeqIndexes(ref AlignSeqInfo alignInfo, string coordSequence)
{
    string gapFreeAlignment = GetNonGapSequenceString(alignInfo.alignSequence);
    int[] indexes = GetXmlIndexes(gapFreeAlignment, coordSequence);

    // Historical note: a BLAST-based fallback match used to be tried here when no index
    // was found; it was dropped because XML sequential numbers are used.
    int indexCount = indexes.Length;
    if (indexCount > 0)
    {
        // indexes are 0-based, start and end are stored 1-based (the +1 was added on January 4, 2017)
        int firstIndex = indexes[0];
        int lastIndex = indexes[indexCount - 1];
        alignInfo.alignStart = firstIndex + 1;
        alignInfo.alignEnd = lastIndex + 1;
        return indexes;
    }

    return null;
}
/// <summary>
/// Parse one FATCAT alignment output file and insert the parsed alignments into the database.
/// </summary>
/// <remarks>
/// The file is read line by line. A line starting with "Align" opens a new alignment record;
/// the "Score" line, the "P-value" line and the "Chain 1: " / "Chain 2:" lines fill it;
/// a line containing "Note:" completes the record and adds it to
/// FatcatTables.fatcatAlignTable. After the whole file is read the table is batch-inserted
/// and cleared. Failures of the disorder-residue filling and of the database insert are
/// written to the log file and do not propagate; parse errors on malformed lines do.
/// </remarks>
/// <param name="alignFile">path of the FATCAT alignment output file</param>
public void ParseFatcatAlignmentFile(string alignFile)
{
    /* modified on April 5, 2010: the input files of FATCAT were changed from Guoli's regular
     * files into my regular files with XML sequential numbers and asymID instead of PDB
     * sequence numbers, so no sequence number conversion is needed. */
    if (logWriter == null)
    {
        logWriter = new StreamWriter("fatcatAlignmentsLog.txt", true);
    }
    logWriter.WriteLine(alignFile);

    string line = "";
    string alignSequence1 = "";
    string alignSequence2 = "";
    int alignStart1 = -1;
    int alignEnd1 = -1;
    int alignStart2 = -1;
    int alignEnd2 = -1;
    string[] fields = null;
    // true until the first "Chain N" line of the current alignment has supplied the start number
    bool chain1Started = false;
    bool chain2Started = false;
    AlignSeqInfo alignInfo1 = new AlignSeqInfo();
    AlignSeqInfo alignInfo2 = new AlignSeqInfo();
    DataRow dataRow = null;
    Dictionary<string, string> entryAuthChainHash = new Dictionary<string, string>();

    // The using block closes the file even when a malformed line makes the parsing throw.
    using (StreamReader dataReader = new StreamReader(alignFile))
    {
        while ((line = dataReader.ReadLine()) != null)
        {
            if (line == "")
            {
                continue;
            }

            if (line.StartsWith("Align", StringComparison.Ordinal))
            {
                // the fatcat format: Align 3v2dX.pdb 70 with 4garZ.pdb 58
                fields = ParseHelper.SplitPlus(line, ' ');
                // get the pdbid and chain id from the fileName
                string[] entryChainFields1 = GetEntryChainFields(fields[1]);
                string[] entryChainFields2 = GetEntryChainFields(fields[4]);

                dataRow = FatcatTables.fatcatAlignTable.NewRow();
                dataRow["QueryEntry"] = entryChainFields1[0];
                dataRow["QueryLength"] = fields[2];
                dataRow["HitEntry"] = entryChainFields2[0];
                dataRow["HitLength"] = fields[5];

                alignInfo1.pdbId = entryChainFields1[0];
                alignInfo1.asymChainId = entryChainFields1[1];
                alignInfo1.chainId = GetAuthorChainFromAsymID(alignInfo1.pdbId, alignInfo1.asymChainId, ref entryAuthChainHash);
                dataRow["QueryChain"] = alignInfo1.chainId;
                dataRow["QueryAsymChain"] = alignInfo1.asymChainId;

                alignInfo2.pdbId = entryChainFields2[0];
                alignInfo2.asymChainId = entryChainFields2[1];
                alignInfo2.chainId = GetAuthorChainFromAsymID(alignInfo2.pdbId, alignInfo2.asymChainId, ref entryAuthChainHash);
                dataRow["HitAsymChain"] = alignInfo2.asymChainId;
                dataRow["HitChain"] = alignInfo2.chainId;

                alignSequence1 = "";
                alignSequence2 = "";
                chain1Started = true;
                chain2Started = true;
            }

            int scoreIdx = line.IndexOf("Score");
            if (scoreIdx > -1)
            {
                // from opt-equ, equivalent positions
                // each value is the text between its label and the next label
                // (for gaps: up to the last '(' of the line)
                int alignLenIdx = line.IndexOf("align-len");
                int gapIdx = line.IndexOf("gaps");
                int gapEndIdx = line.LastIndexOf("(");
                dataRow["Score"] = line.Substring(scoreIdx + "Score".Length + 1, alignLenIdx - scoreIdx - "Score".Length - 1);
                dataRow["Align_Len"] = line.Substring(alignLenIdx + "align-len".Length + 1, gapIdx - alignLenIdx - "align-len".Length - 2);
                dataRow["Gaps"] = line.Substring(gapIdx + "gaps".Length + 1, gapEndIdx - gapIdx - "gaps".Length - 2);
            }

            if (line.IndexOf("P-value") > -1)
            {
                fields = ParseHelper.SplitPlus(line, ' ');
                dataRow["E_Value"] = Convert.ToDouble(fields[1]);
                dataRow["Identity"] = fields[5].TrimEnd('%');
                dataRow["Similarity"] = fields[7].TrimEnd('%');
            }

            if (line.IndexOf("Chain 1: ") > -1)
            {
                fields = ParseHelper.SplitPlus(line, ' ');
                if (chain1Started)
                {
                    alignStart1 = ConvertSeqToInt(fields[2]);
                    chain1Started = false;
                }
                alignSequence1 += fields[3];
                // end = start number of this line + number of non-gap residues on it - 1
                alignEnd1 = ConvertSeqToInt(fields[2]) + GetNonGapAlignedString(fields[3]).Length - 1;
            }

            if (line.IndexOf("Chain 2:") > -1)
            {
                // the colon is turned into a separator before splitting
                line = line.Replace(':', ' ');
                fields = ParseHelper.SplitPlus(line, ' ');
                if (chain2Started)
                {
                    alignStart2 = ConvertSeqToInt(fields[2]);
                    chain2Started = false;
                }
                alignSequence2 += fields[3];
                alignEnd2 = ConvertSeqToInt(fields[2]) + GetNonGapAlignedString(fields[3]).Length - 1;
            }

            if (line.IndexOf("Note:") > -1)
            {
                alignInfo1.alignStart = alignStart1;
                alignInfo1.alignEnd = alignEnd1;
                alignInfo1.alignSequence = alignSequence1;

                alignInfo2.alignStart = alignStart2;
                alignInfo2.alignEnd = alignEnd2;
                alignInfo2.alignSequence = alignSequence2;

                // Alignments that are already in the database are not skipped
                // (the IsAlignmentInDb check is disabled).

                // Convert aligned sequences to xml sequences:
                // add these residues with no-coordinate and no-Calpha to the alignment
                // modified on August 31, 2012
                try
                {
                    seqConverter.AddDisorderResiduesToAlignment(ref alignInfo1, ref alignInfo2);
                }
                catch (Exception ex)
                {
                    // best effort: the alignment is still stored with whatever state
                    // alignInfo1 / alignInfo2 have at this point
                    logWriter.WriteLine(alignInfo1.pdbId + alignInfo1.asymChainId + " " + alignInfo2.pdbId + alignInfo2.asymChainId + " filling out disorder residues failed: " + ex.Message);
                    logWriter.Flush();
                }

                // the alignment length is computed from the sequences as read from the file
                dataRow["AlignmentLength"] = GetAlignmentLength(alignSequence1, alignSequence2);
                dataRow["QuerySequence"] = alignInfo1.alignSequence;
                dataRow["HitSequence"] = alignInfo2.alignSequence;
                // modified on April 10, 2010. Since input files for FATCAT use XML sequential numbers
                dataRow["QueryStart"] = alignInfo1.alignStart;
                dataRow["QueryEnd"] = alignInfo1.alignEnd;
                dataRow["HitStart"] = alignInfo2.alignStart;
                dataRow["HitEnd"] = alignInfo2.alignEnd;

                // Existing database rows are not deleted first (DeleteDbData is disabled), and the
                // row is not inserted individually: all rows are batch-inserted after the loop.
                FatcatTables.fatcatAlignTable.Rows.Add(dataRow);
            }
        }
    }

    try
    {
        dbInsert.BatchInsertDataIntoDBtables(ProtCidSettings.alignmentDbConnection, FatcatTables.fatcatAlignTable);
        FatcatTables.fatcatAlignTable.Clear();
        // "too many open handles to database", try to close the handles before leave by commit or rollback
        dbUpdate.Update(ProtCidSettings.alignmentDbConnection, "Commit;");
    }
    catch (Exception ex)
    {
        logWriter.WriteLine(alignFile + ": error " + ex.Message + "\r\n" + ParseHelper.FormatDataRow(dataRow) + " ");
        logWriter.Flush();
    }
}
/// <summary>
/// Fill out missing residues in the aligned sequence of the first chain.
/// Wherever two consecutive sequence indexes are more than one apart, the residues between
/// them are copied from <paramref name="seqString"/> into the aligned sequence; when the
/// alignment has fewer columns between the two residues than needed, the same number of
/// '-' placeholders is first inserted into both aligned sequences at the same position.
/// </summary>
/// <param name="alignInfo1">alignment info for the first aligned chain; its alignSequence is rewritten</param>
/// <param name="xmlSeqIndexes">
/// indexes into <paramref name="seqString"/> of the aligned residues of the first chain;
/// the array is sorted in place
/// </param>
/// <param name="seqString">xml sequence for the first aligned chain</param>
/// <param name="alignInfo2">alignment info for the second chain; only placeholders are inserted into its alignSequence</param>
/// <exception cref="ArgumentNullException"><paramref name="xmlSeqIndexes"/> is null</exception>
/// <exception cref="InvalidOperationException">GetAlignIndex cannot locate an aligned residue</exception>
private void FillMissingResidues(ref AlignSeqInfo alignInfo1, int[] xmlSeqIndexes, string seqString, ref AlignSeqInfo alignInfo2)
{
    if (xmlSeqIndexes == null)
    {
        throw new ArgumentNullException("xmlSeqIndexes");
    }
    Array.Sort(xmlSeqIndexes);

    // sequence index -> position of that residue in the aligned sequence
    Dictionary<int, int> xmlSeqAlignIdxHash = new Dictionary<int, int>();
    int startAlignIdx = 0;
    int seqIdx = 0;
    for (int i = 0; i < xmlSeqIndexes.Length; i++)
    {
        int alignIdx = GetAlignIndex(alignInfo1.alignSequence, i, startAlignIdx, ref seqIdx);
        if (alignIdx < 0)
        {
            throw new InvalidOperationException("Get aligned index error for " + xmlSeqIndexes[i].ToString());
        }
        xmlSeqAlignIdxHash.Add(xmlSeqIndexes[i], alignIdx);
        // the next search starts from the position just found
        startAlignIdx = alignIdx;
    }

    // No catch-and-rethrow around this loop: "throw ex" would only reset the stack trace.
    for (int i = 0; i < xmlSeqIndexes.Length - 1; i++)
    {
        if (xmlSeqIndexes[i + 1] <= xmlSeqIndexes[i] + 1)
        {
            continue;   // consecutive residues, nothing missing in between
        }

        int xmlSeqDif = xmlSeqIndexes[i + 1] - xmlSeqIndexes[i];
        startAlignIdx = xmlSeqAlignIdxHash[xmlSeqIndexes[i]];
        int endAlignIdx = xmlSeqAlignIdxHash[xmlSeqIndexes[i + 1]];
        int alignSeqDif = endAlignIdx - startAlignIdx;

        if (xmlSeqDif > alignSeqDif)   // columns need to be inserted
        {
            // propagate the difference to all following aligned residues
            int dif = xmlSeqDif - alignSeqDif;
            for (int j = i + 1; j < xmlSeqIndexes.Length; j++)
            {
                xmlSeqAlignIdxHash[xmlSeqIndexes[j]] += dif;
            }

            // place holder, added to both sequences at the same position
            string insertTemp = new string('-', dif);
            alignInfo1.alignSequence = alignInfo1.alignSequence.Insert(startAlignIdx + 1, insertTemp);
            alignInfo2.alignSequence = alignInfo2.alignSequence.Insert(startAlignIdx + 1, insertTemp);
        }

        // Replace the columns right after the start residue (presumably gap characters)
        // with the real residue names from the asymmetric unit sequence.
        string missingResidueString = seqString.Substring(xmlSeqIndexes[i] + 1, xmlSeqDif - 1);
        alignInfo1.alignSequence = alignInfo1.alignSequence.Remove(startAlignIdx + 1, missingResidueString.Length);
        alignInfo1.alignSequence = alignInfo1.alignSequence.Insert(startAlignIdx + 1, missingResidueString);
    }
}