/// <summary>
/// Extract the protein description from a header line: the text that follows
/// the first character found in ProteinLineAccessionEndChars
/// </summary>
/// <param name="headerLine">Header line; a leading start character (ProteinLineStartChar) is removed before parsing</param>
/// <returns>
/// The trimmed text after the first accession end character.
/// If no accession end character is found past index 0, the header line (without the start character) is returned.
/// If headerLine is null or empty, or an error occurs, the original headerLine is returned unchanged.
/// </returns>
private string ExtractDescriptionFromHeader(string headerLine)
{
    // Handle null/empty explicitly rather than relying on a swallowed NullReferenceException
    if (string.IsNullOrEmpty(headerLine))
    {
        return headerLine;
    }

    try
    {
        var trimmedHeader = headerLine;

        // Ordinal comparison: the start character is a file-format marker, not linguistic text
        if (trimmedHeader.StartsWith(ProteinLineStartChar.ToString(), StringComparison.Ordinal))
        {
            // Remove the > character from the start of the line
            trimmedHeader = trimmedHeader.TrimStart(ProteinLineStartChar).Trim();
        }

        var charIndex = trimmedHeader.IndexOfAny(ProteinLineAccessionEndChars);
        if (charIndex > 0)
        {
            return trimmedHeader.Substring(charIndex + 1).Trim();
        }

        // No separator found: the whole header doubles as the description
        return trimmedHeader;
    }
    catch (Exception)
    {
        // Best effort: ignore any errors and return the header line as it was passed in
        return headerLine;
    }
}
/// <summary>
/// Header line (protein name and description)
/// </summary>
/// <param name="includeStartChar">If true then include the start character (&gt;)</param>
/// <returns>
/// The current entry's header line. When includeStartChar is false and the header line
/// starts with ProteinLineStartChar, the leading start character(s) are removed and the result is trimmed.
/// A null or empty header line is returned as-is.
/// </returns>
public string GetHeaderLine(bool includeStartChar)
{
    var headerLine = mCurrentEntry.HeaderLine;

    // Nothing to strip; also avoids using an exception to handle a null header line
    if (includeStartChar || string.IsNullOrEmpty(headerLine))
    {
        return headerLine;
    }

    // Ordinal comparison: the start character is a file-format marker, not linguistic text
    if (headerLine.StartsWith(ProteinLineStartChar.ToString(), StringComparison.Ordinal))
    {
        // Remove the > character from the start of the line
        return headerLine.TrimStart(ProteinLineStartChar).Trim();
    }

    return headerLine;
}
/// <summary>
/// Extract the accession name from the header line by looking for
/// the first character found in ProteinLineAccessionEndChars
/// </summary>
/// <param name="headerLine">Header line; a leading start character (ProteinLineStartChar) is removed before parsing</param>
/// <returns>
/// Accession (protein) name: the trimmed text before the first accession end character.
/// If no accession end character is found past index 0, the header line (without the start character) is returned.
/// If headerLine is null or empty, it is returned unchanged.
/// </returns>
/// <remarks>The header line may or may not start with the > character; if present, it is removed here</remarks>
private string ExtractAccessionNameFromHeader(string headerLine)
{
    // Handle null/empty explicitly rather than relying on a swallowed NullReferenceException
    if (string.IsNullOrEmpty(headerLine))
    {
        return headerLine;
    }

    try
    {
        // Ordinal comparison: the start character is a file-format marker, not linguistic text
        if (headerLine.StartsWith(ProteinLineStartChar.ToString(), StringComparison.Ordinal))
        {
            // Remove the > character from the start of the line
            headerLine = headerLine.TrimStart(ProteinLineStartChar).Trim();
        }

        var charIndex = headerLine.IndexOfAny(ProteinLineAccessionEndChars);
        if (charIndex > 0)
        {
            return headerLine.Substring(0, charIndex).Trim();
        }

        // No separator found: the whole header is the accession name
        return headerLine;
    }
    catch (Exception)
    {
        // Ignore any errors; return the full line (with the start character removed if that step succeeded)
        return headerLine;
    }
}
/// <summary>
/// Reads the next entry in a Fasta file
/// </summary>
/// <returns>True if an entry is found, otherwise false</returns>
/// <remarks>
/// Reading stops at the header line of the following entry; that line is stored in mCachedHeaderLine
/// and consumed by the next call. Exceptions are reported to the console and result in a return value of false.
/// </remarks>
public override bool ReadNextProteinEntry()
{
    mCurrentEntry.Clear();

    // This is always 0 for Fasta files
    mFileLineSkipCount = 0;

    if (mProteinFileInputStream == null)
    {
        return false;
    }

    var proteinEntryFound = false;

    try
    {
        var proteinLineStartChar = ProteinLineStartChar.ToString();

        // Keep going while a cached header is pending, even if the stream is exhausted;
        // otherwise a header that was the last line of the file (entry with no residues) would be lost
        while (!proteinEntryFound &&
               (!string.IsNullOrWhiteSpace(mCachedHeaderLine) || !mProteinFileInputStream.EndOfStream))
        {
            string lineIn;

            if (!string.IsNullOrWhiteSpace(mCachedHeaderLine))
            {
                // Header found by the previous call; it was already counted in the line and byte totals
                lineIn = mCachedHeaderLine;
                mCachedHeaderLine = string.Empty;
            }
            else
            {
                lineIn = mProteinFileInputStream.ReadLine();
                mFileLinesRead++;

                // 2 extra bytes are added per line (presumably for a CR+LF line terminator)
                if (lineIn == null)
                {
                    mFileBytesRead += 2;
                }
                else
                {
                    mFileBytesRead += lineIn.Length + 2;
                }
            }

            if (string.IsNullOrWhiteSpace(lineIn))
            {
                continue;
            }

            var dataLine = lineIn.Trim();

            // See if lineIn starts with the protein header start character; skip the line if not
            if (!dataLine.StartsWith(proteinLineStartChar, StringComparison.Ordinal))
            {
                continue;
            }

            mCurrentEntry.HeaderLine = dataLine;
            mCurrentEntry.Name = ExtractAccessionNameFromHeader(dataLine);
            mCurrentEntry.Description = ExtractDescriptionFromHeader(dataLine);
            mCurrentEntry.Sequence = string.Empty;

            mProteinResidues.Clear();
            proteinEntryFound = true;

            // Now continue reading until the next protein header start character is found
            while (!mProteinFileInputStream.EndOfStream)
            {
                var lineIn2 = mProteinFileInputStream.ReadLine();
                mFileLinesRead++;

                if (lineIn2 == null)
                {
                    mFileBytesRead += 2;
                    continue;
                }

                mFileBytesRead += lineIn2.Length + 2;

                if (lineIn2.TrimStart().StartsWith(proteinLineStartChar, StringComparison.Ordinal))
                {
                    // Found the next protein entry
                    // Store in mCachedHeaderLine and jump out of the loop
                    mCachedHeaderLine = lineIn2;
                    break;
                }

                if (DiscardProteinResidues)
                {
                    continue;
                }

                // lineIn2 has additional residues for the current protein
                mProteinResidues.Append(lineIn2.Trim());
            }

            mCurrentEntry.Sequence = mProteinResidues.ToString();
        }

        if (!proteinEntryFound)
        {
            // No further entries: the input has been fully consumed
            AdjustBytesReadForEOF();
        }
    }
    catch (Exception ex)
    {
        // Error reading the input file
        proteinEntryFound = false;
        Console.WriteLine("Exception in FastaFileReader.ReadNextProteinEntry: " + ex.Message);
    }

    return proteinEntryFound;
}