/// <summary>
/// Finds the end position of an alignment whose start position is already known.
/// The gap-free aligned sequence is searched for inside the gap-free coordinate
/// sequence of the chain. When it is found, <c>alignInfo.alignEnd</c> is set to the
/// value the coordinate-to-sequence map holds for key
/// (match index + aligned length). When it is not found,
/// <paramref name="alignInfo"/> is left unchanged.
/// </summary>
/// <param name="alignInfo">
/// Alignment to complete. <c>pdbId</c>, <c>asymChainId</c> (converted to an integer),
/// <c>alignStart</c> and <c>alignSequence</c> are read; <c>alignEnd</c> is written.
/// </param>
/// <exception cref="InvalidOperationException">
/// The aligned sequence was located but the map has no entry for its end index.
/// </exception>
private void FindEndPosition(ref AlignSeqInfo alignInfo)
{
    string noGapAlignSequence = GetNonGapAlignedString(alignInfo.alignSequence);
    string sequenceInCoord = GetChainSequenceInCoordinates(alignInfo.pdbId, Convert.ToInt32(alignInfo.asymChainId));
    string noGapCoordSeq = "";
    Hashtable coordSeqIdSeqIdHash = GetCoordSeqToSeqHash(sequenceInCoord, out noGapCoordSeq, alignInfo.alignStart);

    // Residue codes are data, not linguistic text: an ordinal search keeps the
    // result independent of the culture the process happens to run under.
    int alignStartIdx = noGapCoordSeq.IndexOf(noGapAlignSequence, StringComparison.Ordinal);
    if (alignStartIdx < 0)
    {
        return;
    }

    // NOTE(review): the map keys look 1-based (FindStartEndPosition reads the start
    // at match index + 1), which would make this the key of the last aligned
    // residue - confirm against GetCoordSeqToSeqHash.
    int alignEndIdx = alignStartIdx + noGapAlignSequence.Length;

    // get the XML sequential id; report a missing entry explicitly instead of
    // letting the unboxing cast fail with an uninformative NullReferenceException
    object alignEnd = coordSeqIdSeqIdHash[alignEndIdx];
    if (alignEnd == null)
    {
        throw new InvalidOperationException(
            "No sequential id is mapped to coordinate index " + alignEndIdx +
            " for entry " + alignInfo.pdbId + ", chain " + alignInfo.asymChainId + ".");
    }
    alignInfo.alignEnd = (int)alignEnd;
}
/// <summary>
/// Finds the start and end positions of an aligned sequence that only covers
/// residues with coordinates.
/// Fatcat doesn't provide the end position which is in the PDB file, so the
/// real start and end positions have to be found in XML residue sequential numbers.
/// The gap-free aligned sequence is searched for inside the gap-free coordinate
/// sequence of the chain. When it is found, <c>alignInfo.alignStart</c> and
/// <c>alignInfo.alignEnd</c> are set from the coordinate-to-sequence map; when it
/// is not found, <paramref name="alignInfo"/> is left unchanged.
/// </summary>
/// <param name="pdbId">Entry whose chain sequence is fetched.</param>
/// <param name="entityID">Identifier passed to GetChainSequenceInCoordinates together with <paramref name="pdbId"/>.</param>
/// <param name="startPos">Start position handed to GetCoordSeqToSeqHash when the map is built.</param>
/// <param name="alignInfo">Alignment to complete; <c>alignSequence</c> is read, <c>alignStart</c> and <c>alignEnd</c> are written.</param>
/// <exception cref="InvalidOperationException">
/// The aligned sequence was located but the map has no entry for its start or end index.
/// </exception>
private void FindStartEndPosition(string pdbId, int entityID, int startPos, ref AlignSeqInfo alignInfo)
{
    string noGapAlignSequence = GetNonGapAlignedString(alignInfo.alignSequence);
    string sequenceInCoord = GetChainSequenceInCoordinates(pdbId, entityID);
    string noGapCoordSeq = "";
    Hashtable coordSeqIdSeqIdHash = GetCoordSeqToSeqHash(sequenceInCoord, out noGapCoordSeq, startPos);

    // Residue codes are data, not linguistic text: an ordinal search keeps the
    // result independent of the culture the process happens to run under.
    int matchIdx = noGapCoordSeq.IndexOf(noGapAlignSequence, StringComparison.Ordinal);
    if (matchIdx < 0)
    {
        return;
    }

    // The start is read at (match index + 1) and the end at (match index + length).
    // NOTE(review): this implies 1-based map keys - confirm against GetCoordSeqToSeqHash.
    int alignStartIdx = matchIdx + 1;
    int alignEndIdx = matchIdx + noGapAlignSequence.Length;

    // get the XML sequential ids; resolve both before touching alignInfo so that a
    // failed lookup never leaves it half-updated
    object alignStart = coordSeqIdSeqIdHash[alignStartIdx];
    if (alignStart == null)
    {
        throw new InvalidOperationException(
            "No sequential id is mapped to coordinate index " + alignStartIdx +
            " for entry " + pdbId + ", entity " + entityID + ".");
    }
    object alignEnd = coordSeqIdSeqIdHash[alignEndIdx];
    if (alignEnd == null)
    {
        throw new InvalidOperationException(
            "No sequential id is mapped to coordinate index " + alignEndIdx +
            " for entry " + pdbId + ", entity " + entityID + ".");
    }

    alignInfo.alignStart = (int)alignStart;
    alignInfo.alignEnd = (int)alignEnd;
}
/// <summary>
/// Parses one FATCAT alignment output file and inserts each alignment it
/// contains into the alignment database.
/// The file is read line by line. An "Align" line starts a new record, the
/// "Score" and "P-value" lines fill in the statistics, "Chain 1:" / "Chain 2:"
/// lines accumulate the aligned sequences, and a "Note:" line completes the
/// record and triggers the insert. Reading stops at a "#Time used" line.
/// A line that cannot be processed is written to the log together with the
/// raw text collected for the current record, and parsing continues.
/// </summary>
/// <param name="alignFile">Path of the FATCAT alignment output file.</param>
/// <param name="isUpdate">
/// When true, DeletePfamAlignment is called for the query/hit entry and domain
/// ids before each row is inserted.
/// </param>
public void ParsePfamFatcatAlignmentFile(string alignFile, bool isUpdate)
{
    // "using" guarantees the reader is released even when ReadLine or the
    // logging in the catch handler throws.
    using (StreamReader dataReader = new StreamReader(alignFile))
    {
        string line = "";
        int scoreIdx = -1;
        int alignLenIdx = -1;
        int gapIdx = -1;
        int gapEndIdx = -1;
        string alignSequence1 = "";
        string alignSequence2 = "";
        int alignStart1 = -1;
        int alignEnd1 = -1;
        int alignStart2 = -1;
        int alignEnd2 = -1;
        string[] fields = null;
        // true until the first "Chain N:" line of the current record has been seen
        bool chain1Started = false;
        bool chain2Started = false;
        AlignSeqInfo alignInfo1 = new AlignSeqInfo();
        AlignSeqInfo alignInfo2 = new AlignSeqInfo();
        // one row object is reused for every record in the file
        DataRow dataRow = FatcatTables.fatcatAlignTable.NewRow();
        // raw text of the current record, kept only for error logging
        string dataLine = "";
        // the asymchain and startpos for this domain
        Hashtable domainChainInfoHash = new Hashtable();

        while ((line = dataReader.ReadLine()) != null)
        {
            if (line == "")
            {
                continue;
            }
            try
            {
                dataLine += (line + "\r\n");

                // header line of a new alignment record
                if (line.StartsWith("Align", StringComparison.Ordinal))
                {
                    fields = ParseHelper.SplitPlus(line, ' ');

                    // domain 1
                    string[] domainInfo1 = ParseDomainName(fields[1], ref domainChainInfoHash);
                    dataRow["QueryEntry"] = domainInfo1[0];
                    dataRow["QueryDomainID"] = domainInfo1[1];
                    dataRow["QueryEntity"] = domainInfo1[2];
                    dataRow["QueryDomainStart"] = domainInfo1[3];
                    dataRow["QueryLength"] = fields[2];

                    // domain 2
                    string[] domainInfo2 = ParseDomainName(fields[4], ref domainChainInfoHash);
                    dataRow["HitEntry"] = domainInfo2[0];
                    dataRow["HitDomainID"] = domainInfo2[1];
                    dataRow["HitEntity"] = domainInfo2[2];
                    dataRow["HitDomainStart"] = domainInfo2[3];
                    dataRow["HitLength"] = fields[5];

                    alignInfo1.pdbId = fields[1].Substring(0, 4);
                    alignInfo1.asymChainId = domainInfo1[2];
                    alignInfo2.pdbId = fields[4].Substring(0, 4);
                    alignInfo2.asymChainId = domainInfo2[2];

                    alignSequence1 = "";
                    alignSequence2 = "";
                    chain1Started = true;
                    chain2Started = true;
                }

                scoreIdx = line.IndexOf("Score", StringComparison.Ordinal);
                if (scoreIdx > -1)
                {
                    // Score, align-len and gaps are cut out by position relative to
                    // their labels; the gaps value ends before the last '(' of the line.
                    // AlignmentLength is not taken from this line - it is computed from
                    // the two aligned sequences when the record is completed.
                    alignLenIdx = line.IndexOf("align-len", StringComparison.Ordinal);
                    gapIdx = line.IndexOf("gaps", StringComparison.Ordinal);
                    gapEndIdx = line.LastIndexOf('(');
                    dataRow["Score"] = line.Substring(scoreIdx + "Score".Length + 1,
                        alignLenIdx - scoreIdx - "Score".Length - 1);
                    dataRow["Align_Len"] = line.Substring(alignLenIdx + "align-len".Length + 1,
                        gapIdx - alignLenIdx - "align-len".Length - 2);
                    dataRow["Gaps"] = line.Substring(gapIdx + "gaps".Length + 1,
                        gapEndIdx - gapIdx - "gaps".Length - 2);
                }

                if (line.IndexOf("P-value", StringComparison.Ordinal) > -1)
                {
                    fields = ParseHelper.SplitPlus(line, ' ');
                    // the file is machine generated, so parse with the invariant
                    // culture instead of whatever culture the host is configured for
                    dataRow["E_Value"] = Convert.ToDouble(fields[1], System.Globalization.CultureInfo.InvariantCulture);
                    dataRow["Identity"] = fields[5].TrimEnd('%');
                    dataRow["Similarity"] = fields[7].TrimEnd('%');
                }

                if (line.IndexOf("Chain 1:", StringComparison.Ordinal) > -1)
                {
                    // contain alignStart and aligned sequence
                    fields = ParseChainAlignSeqLine(line);
                    if (chain1Started)
                    {
                        // only the first chain line of a record carries the overall start
                        alignStart1 = ConvertSeqToInt(fields[0]);
                        chain1Started = false;
                    }
                    alignSequence1 += fields[1];
                    // end = start of this line + number of non-gap residues on it - 1
                    alignEnd1 = ConvertSeqToInt(fields[0]) + GetNonGapAlignedString(fields[1]).Length - 1;
                }

                if (line.IndexOf("Chain 2:", StringComparison.Ordinal) > -1)
                {
                    fields = ParseChainAlignSeqLine(line);
                    if (chain2Started)
                    {
                        alignStart2 = ConvertSeqToInt(fields[0]);
                        chain2Started = false;
                    }
                    alignSequence2 += fields[1];
                    alignEnd2 = ConvertSeqToInt(fields[0]) + GetNonGapAlignedString(fields[1]).Length - 1;
                }

                // the "Note:" line closes a record
                if (line.IndexOf("Note:", StringComparison.Ordinal) > -1)
                {
                    if (alignSequence1 == "")
                    {
                        continue;
                    }

                    alignInfo1.alignStart = alignStart1;
                    alignInfo1.alignEnd = alignEnd1;
                    alignInfo1.alignSequence = alignSequence1;
                    if (alignInfo1.alignStart < 0)
                    {
                        // the domain start is passed as int, so convert to Int32
                        // rather than narrowing it to Int16 first
                        FindStartEndPosition(dataRow["QueryEntry"].ToString(),
                            Convert.ToInt32(dataRow["QueryEntity"].ToString()),
                            Convert.ToInt32(dataRow["QueryDomainStart"].ToString()),
                            ref alignInfo1);
                    }

                    alignInfo2.alignStart = alignStart2;
                    alignInfo2.alignEnd = alignEnd2;
                    alignInfo2.alignSequence = alignSequence2;
                    if (alignInfo2.alignStart < 0)
                    {
                        FindStartEndPosition(dataRow["HitEntry"].ToString(),
                            Convert.ToInt32(dataRow["HitEntity"].ToString()),
                            Convert.ToInt32(dataRow["HitDomainStart"].ToString()),
                            ref alignInfo2);
                    }

                    dataRow["AlignmentLength"] = GetAlignmentLength(alignSequence1, alignSequence2);
                    dataRow["QuerySequence"] = alignInfo1.alignSequence;
                    dataRow["HitSequence"] = alignInfo2.alignSequence;
                    dataRow["QueryStart"] = alignInfo1.alignStart;
                    dataRow["QueryEnd"] = alignInfo1.alignEnd;
                    dataRow["HitStart"] = alignInfo2.alignStart;
                    dataRow["HitEnd"] = alignInfo2.alignEnd;

                    // delete the previous data
                    if (isUpdate)
                    {
                        DeletePfamAlignment(dataRow["QueryEntry"].ToString(),
                            Convert.ToInt64(dataRow["QueryDomainID"].ToString()),
                            dataRow["HitEntry"].ToString(),
                            Convert.ToInt64(dataRow["HitDomainID"].ToString()));
                    }
                    if (FatcatTables.fatcatAlignTable.Columns.Contains("QuerySeqNumbers"))
                    {
                        AddQueryHitSeqNumbers(dataRow);
                    }
                    dbInsert.InsertDataIntoDb(AppSettings.alignmentDbConnection, dataRow);

                    alignSequence1 = "";
                    alignSequence2 = "";
                    dataLine = "";
                }

                if (line.IndexOf("#Time used", StringComparison.Ordinal) > -1)
                {
                    break;
                }
            }
            catch (Exception ex)
            {
                // log the failure with the offending line and the record text
                // collected so far, then carry on with the next line
                logWriter.WriteLine(ex.Message);
                logWriter.WriteLine(line);
                logWriter.WriteLine(dataLine);
                logWriter.Flush();
                dataLine = "";
            }
        }
    }
}