/// <summary>
/// Parses a single biological sequence text from a reader.
/// </summary>
/// <param name="reader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.
/// </param>
/// <returns>The parsed ISequence object.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
public ISequence ParseOne(TextReader reader, bool isReadOnly)
{
    // Reject a missing reader up front, before any parser state is touched.
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    // Reset the line/sequence tracking fields for this parse.
    _lineCount = 0;
    _sequenceCount = 0;
    _lineLength = 0;
    _sequenceBeginsAt = 1;

    // The BioTextReader wrapper is disposed as soon as the sequence has been parsed.
    using (BioTextReader bioReader = new BioTextReader(reader))
    {
        return ParseOne(bioReader, isReadOnly);
    }
}
/// <summary>
/// Reads the SAM alignment header stored in the given file.
/// </summary>
/// <param name="fileName">Name of the file to read the header from.</param>
/// <returns>The header produced by the BioTextReader based overload.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="fileName"/> is null, empty or only white space.
/// </exception>
public static SAMAlignmentHeader ParserSAMHeader(string fileName)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    SAMAlignmentHeader header;
    using (BioTextReader headerReader = new BioTextReader(fileName))
    {
        header = ParserSAMHeader(headerReader);
    }

    return header;
}
/// <summary>
/// Parses one sequence alignment from the supplied BioTextReader.
/// </summary>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Passed through to the format specific parser; when true the resulting sequences are
/// meant to be read-only, otherwise editable.
/// </param>
/// <returns>The alignment returned by ParseOneWithSpecificFormat.</returns>
/// <exception cref="InvalidDataException">Thrown when the reader has no lines.</exception>
private ISequenceAlignment ParseOne(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader.HasLines)
    {
        // Non-empty input: delegate to the format specific parser.
        return ParseOneWithSpecificFormat(bioReader, isReadOnly);
    }

    // Empty input is not allowed.
    throw new InvalidDataException(Properties.Resource.IONoTextToParse);
}
/// <summary>
/// Parses the sequence alignment text stored in a file.
/// </summary>
/// <param name="fileName">Name of the file to parse.</param>
/// <param name="isReadOnly">
/// Passed through to the BioTextReader based overload; when true the sequences in the
/// resulting alignment are meant to be read-only, otherwise editable.
/// </param>
/// <returns>SequenceAlignmentMap object.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="fileName"/> is null, empty or only white space.
/// </exception>
public SequenceAlignmentMap Parse(string fileName, bool isReadOnly)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    SequenceAlignmentMap alignmentMap;
    using (BioTextReader fileReader = new BioTextReader(fileName))
    {
        alignmentMap = Parse(fileReader, isReadOnly);
    }

    return alignmentMap;
}
/// <summary>
/// Parses the sequence alignment text supplied by a text reader.
/// </summary>
/// <param name="reader">Text reader.</param>
/// <param name="isReadOnly">
/// Passed through to the BioTextReader based overload; when true the sequences in the
/// resulting alignment are meant to be read-only, otherwise editable.
/// </param>
/// <returns>SequenceAlignmentMap object.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
public SequenceAlignmentMap Parse(TextReader reader, bool isReadOnly)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    SequenceAlignmentMap alignmentMap;
    using (BioTextReader wrappedReader = new BioTextReader(reader))
    {
        alignmentMap = Parse(wrappedReader, isReadOnly);
    }

    return alignmentMap;
}
/// <summary>
/// Parses a list of biological sequence data from a file.
/// </summary>
/// <param name="filename">The name of a biological sequence file.</param>
/// <param name="isReadOnly">
/// When true the resulting QualitativeSequences are created read-only, otherwise editable.
/// </param>
/// <returns>
/// A VirtualQualitativeSequenceList backed by a sidecar provider when one is obtained,
/// otherwise the list produced by parsing the file through a BioTextReader.
/// </returns>
public new IList<IQualitativeSequence> Parse(string filename, bool isReadOnly)
{
    _fileName = filename;

    // Pick the block sizes; these feed the data virtualization (DV) decision below.
    if (filename == null)
    {
        _blockSize = FileLoadHelper.DefaultFullLoadBlockSize;
        _maxNumberOfBlocks = 0;
    }
    else
    {
        _fileLoadHelper = new FileLoadHelper(filename);
        _blockSize = _fileLoadHelper.BlockSize;
        _maxNumberOfBlocks = _fileLoadHelper.MaxNumberOfBlocks;

        if (_isDataVirtualizationForced)
        {
            _blockSize = FileLoadHelper.DefaultBlockSize;
        }
    }

    // Look for a sidecar provider; a cancelled lookup is treated as "no sidecar".
    SidecarFileProvider sidecar = null;
    if (IsDataVirtualizationEnabled)
    {
        try
        {
            sidecar = SidecarFileProvider.GetProvider(filename);
        }
        catch (OperationCanceledException)
        {
            sidecar = null;
        }
    }

    if (sidecar == null)
    {
        // No sidecar: parse the file directly.
        using (BioTextReader bioReader = new BioTextReader(filename))
        {
            return Parse(bioReader, isReadOnly);
        }
    }

    // Sidecar available: hand back a virtual list over it.
    VirtualQualitativeSequenceList virtualList =
        new VirtualQualitativeSequenceList(sidecar, this, sidecar.Count);
    virtualList.CreateSequenceAsReadOnly = isReadOnly;
    return virtualList;
}
/// <summary>
/// Reads the SAM alignment header from the given text reader.
/// </summary>
/// <param name="reader">Text reader.</param>
/// <returns>The header produced by the BioTextReader based overload.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
public static SAMAlignmentHeader ParserSAMHeader(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    SAMAlignmentHeader header;
    using (BioTextReader headerReader = new BioTextReader(reader))
    {
        header = ParserSAMHeader(headerReader);
    }

    return header;
}
/// <summary>
/// Parses a single FastQ text from a BioTextReader.
/// </summary>
/// <param name="bioReader">BioTextReader instance for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Passed through to ParseOneWithFastQFormat; when true the resulting QualitativeSequence
/// is meant to be read-only, otherwise editable.
/// </param>
/// <returns>The parsed IQualitativeSequence object.</returns>
/// <exception cref="FileFormatException">Thrown (after being traced) when the reader has no lines.</exception>
private IQualitativeSequence ParseOne(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader.HasLines)
    {
        // Non-empty input: do the actual parsing.
        return ParseOneWithFastQFormat(bioReader, isReadOnly);
    }

    // Empty input is not allowed: report it, then fail.
    string noTextMessage = string.Format(
        CultureInfo.CurrentCulture,
        Resource.IOFormatErrorMessage,
        Name,
        Resource.IONoTextToParse);
    Trace.Report(noTextMessage);
    throw new FileFormatException(noTextMessage);
}
/// <summary>
/// Parses a list of biological sequence data from a BioTextReader.
/// </summary>
/// <param name="bioReader">BioTextReader instance for a biological sequence data.</param>
/// <param name="isReadOnly">
/// When true the resulting QualitativeSequences are created read-only, otherwise editable.
/// </param>
/// <returns>
/// A VirtualQualitativeSequenceList when the reader has a file name, data virtualization is
/// enabled and SidecarFileProvider.IsIndexFileExists reports true for that file; otherwise
/// an in-memory list with every parsed sequence.
/// </returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bioReader"/> is null.</exception>
/// <exception cref="FileFormatException">Thrown (after being traced) when the reader has no lines.</exception>
protected new IList<IQualitativeSequence> Parse(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    // Empty input is not allowed.
    if (!bioReader.HasLines)
    {
        string noTextMessage = string.Format(
            CultureInfo.CurrentCulture,
            Resource.IOFormatErrorMessage,
            Name,
            Resource.IONoTextToParse);
        Trace.Report(noTextMessage);
        throw new FileFormatException(noTextMessage);
    }

    bool useSidecar = !string.IsNullOrEmpty(bioReader.FileName)
        && IsDataVirtualizationEnabled
        && SidecarFileProvider.IsIndexFileExists(bioReader.FileName);

    if (!useSidecar)
    {
        // Plain path: keep every parsed sequence in memory.
        List<IQualitativeSequence> parsed = new List<IQualitativeSequence>();
        while (bioReader.HasLines)
        {
            parsed.Add(ParseOne(bioReader, isReadOnly));
        }

        return parsed;
    }

    // Virtualized path: the parsed sequences themselves are discarded; only the
    // entries accumulated in _sequencePointers are used to build the sidecar.
    while (bioReader.HasLines)
    {
        ParseOne(bioReader, isReadOnly);
    }

    SidecarFileProvider provider =
        SidecarFileProvider.CreateIndexFile(bioReader.FileName, _sequencePointers);
    VirtualQualitativeSequenceList virtualSequences =
        new VirtualQualitativeSequenceList(provider, this, _sequencePointers.Count);
    virtualSequences.CreateSequenceAsReadOnly = isReadOnly;

    _sequencePointers.Clear();
    return virtualSequences;
}
// Returns, as one string, the data of a header block that spans multiple lines.
// The data of each continuation line is appended after lineBreakSubstitution.
// On return the reader is positioned on the first line that has a header (or past the end).
private static string ParseMultiLineData(BioTextReader bioReader, string lineBreakSubstitution)
{
    string combined = bioReader.LineData;

    // Consume every following line that has no header of its own.
    for (bioReader.GoToNextLine();
         bioReader.HasLines && !bioReader.LineHasHeader;
         bioReader.GoToNextLine())
    {
        combined = string.Concat(combined, lineBreakSubstitution, bioReader.LineData);
    }

    return combined;
}
/// <summary>
/// Parses the sequence alignment text supplied by a reader and returns it wrapped in a list.
/// </summary>
/// <param name="reader">A reader for a sequence alignment text.</param>
/// <param name="isReadOnly">
/// Passed through to the BioTextReader based overload; when true the sequences in the
/// resulting alignment are meant to be read-only, otherwise editable.
/// </param>
/// <returns>A list holding the single parsed ISequenceAlignment object.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
IList<ISequenceAlignment> ISequenceAlignmentParser.Parse(TextReader reader, bool isReadOnly)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    using (BioTextReader bioReader = new BioTextReader(reader))
    {
        // Exactly one alignment is parsed per call.
        return new List<ISequenceAlignment> { Parse(bioReader, isReadOnly) };
    }
}
/// <summary>
/// Parses the sequence alignment text stored in a file and returns it wrapped in a list.
/// </summary>
/// <param name="fileName">The name of a sequence alignment file.</param>
/// <param name="isReadOnly">
/// Passed through to the BioTextReader based overload; when true the sequences in the
/// resulting alignment are meant to be read-only, otherwise editable.
/// </param>
/// <returns>A list holding the single parsed ISequenceAlignment object.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="fileName"/> is null, empty or only white space.
/// </exception>
IList<ISequenceAlignment> ISequenceAlignmentParser.Parse(string fileName, bool isReadOnly)
{
    if (string.IsNullOrWhiteSpace(fileName))
    {
        throw new ArgumentNullException("fileName");
    }

    List<ISequenceAlignment> result = new List<ISequenceAlignment>();
    using (BioTextReader fileReader = new BioTextReader(fileName))
    {
        ISequenceAlignment parsedAlignment = Parse(fileReader, isReadOnly);
        result.Add(parsedAlignment);
    }

    return result;
}
/// <summary>
/// Parses a SequenceAlignmentMap using a BioTextReader.
/// </summary>
/// <param name="bioReader">A reader for a sequence alignment text.</param>
/// <param name="isReadOnly">
/// Passed through to ParseOneWithSpecificFormat; when true the sequences in the resulting
/// alignment are meant to be read-only, otherwise editable.
/// </param>
/// <returns>
/// The parsed SequenceAlignmentMap, or null when the reader holds nothing but blank lines.
/// </returns>
private SequenceAlignmentMap Parse(BioTextReader bioReader, bool isReadOnly)
{
    // Skip leading lines that are empty once trimmed.
    while (bioReader.HasLines && string.IsNullOrEmpty(bioReader.Line.Trim()))
    {
        bioReader.GoToNextLine();
    }

    if (!bioReader.HasLines)
    {
        return null;
    }

    // First non-blank line found: parse the header and the alignment blocks.
    return ParseOneWithSpecificFormat(bioReader, isReadOnly);
}
// Collects every consecutive COMMENT header block into the Comments list of the
// sequence's GenBank metadata. Blank-line skipping is switched off while reading and
// is switched back on afterwards, even if reading a comment throws.
private static void ParseComments(BioTextReader bioReader, ref Sequence sequence)
{
    IList<string> commentList =
        ((GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey]).Comments;

    // don't skip blank lines in comments
    bioReader.SkipBlankLines = false;
    try
    {
        while (bioReader.HasLines && bioReader.LineHeader == "COMMENT")
        {
            // ParseMultiLineData already advances the reader past the comment, so the
            // reader is not moved again here; the current line still needs processing.
            commentList.Add(ParseMultiLineData(bioReader, Environment.NewLine));
        }
    }
    finally
    {
        // back to skipping blank lines when done with comments (or on failure)
        bioReader.SkipBlankLines = true;
    }
}
/// <summary>
/// Parses a single sequence using a BioTextReader.
/// </summary>
/// <param name="bioReader">Reader to parse from; its FileName is remembered in _fileName.</param>
/// <param name="isReadOnly">Passed through to ParseOneWithSpecificFormat.</param>
/// <returns>The sequence returned by ParseOneWithSpecificFormat.</returns>
/// <exception cref="InvalidOperationException">Thrown (after being traced) when the reader has no lines.</exception>
private ISequence ParseOne(BioTextReader bioReader, bool isReadOnly)
{
    _fileName = bioReader.FileName;

    if (bioReader.HasLines)
    {
        // Non-empty input: do the actual parsing.
        return ParseOneWithSpecificFormat(bioReader, isReadOnly);
    }

    // Empty input is not allowed: report it, then fail.
    string emptyInputMessage = Resource.Parser_NoTextErrorMessage;
    Trace.Report(emptyInputMessage);
    throw new InvalidOperationException(emptyInputMessage);
}
/// <summary>
/// Parses SAM alignment header from specified BioTextReader.
/// </summary>
/// <remarks>
/// Consecutive lines starting with "@" are consumed. A line whose record type code is "CO"
/// is stored as a comment (the text after the first four characters); every other line
/// becomes a SAMRecordField whose tags are built from the remaining tab separated tokens.
/// When no header line is present an empty header is returned without validation.
/// </remarks>
/// <param name="bioReader">Bio text reader.</param>
/// <returns>The parsed header.</returns>
/// <exception cref="FormatException">
/// Thrown when at least one header line was read and SAMAlignmentHeader.IsValid returns a message.
/// </exception>
private static SAMAlignmentHeader ParserSAMHeader(BioTextReader bioReader)
{
    SAMAlignmentHeader samHeader = new SAMAlignmentHeader();

    // No header section: hand back the empty header as is.
    if (!bioReader.HasLines || !bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        return samHeader;
    }

    while (bioReader.HasLines && bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        string line = bioReader.Line;

        // Validate the header format before slicing the line at fixed offsets.
        ValidateHeaderLineFormat(line);

        string[] tokens = line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

        // Drop the leading '@' to get the record type code.
        string recordTypecode = tokens[0].Substring(1);

        if (string.Equals(recordTypecode, "CO", StringComparison.OrdinalIgnoreCase))
        {
            // Comment line: keep everything after the first four characters.
            samHeader.Comments.Add(line.Substring(4));
        }
        else
        {
            SAMRecordField headerLine = new SAMRecordField(recordTypecode);
            for (int i = 1; i < tokens.Length; i++)
            {
                // Each token is a two character tag name, one separator character, then the value.
                string tagToken = tokens[i];
                string tagName = tagToken.Substring(0, 2);
                headerLine.Tags.Add(new SAMRecordFieldTag(tagName, tagToken.Substring(3)));
            }

            samHeader.RecordFields.Add(headerLine);
        }

        bioReader.GoToNextLine();
    }

    // IsValid returns an error message when the header is not valid.
    string message = samHeader.IsValid();
    if (!string.IsNullOrEmpty(message))
    {
        throw new FormatException(message);
    }

    return samHeader;
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// A read-only sequence holding the block read from the file, or null when the pointer's
/// starting index plus <paramref name="startIndex"/> is not below its ending index.
/// </returns>
/// <exception cref="NotSupportedException">Thrown when no input file name is known.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="startIndex"/> is negative or <paramref name="count"/> is not positive.
/// </exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (count <= 0)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Single() throws unless exactly one known alphabet carries the pointer's alphabet name.
    IAlphabet alphabet = Alphabets.All.Single(candidate => candidate.Name.Equals(seqPointer.AlphabetName));
    Sequence rangeSequence = new Sequence(alphabet);
    rangeSequence.IsReadOnly = false;

    int absoluteStart = (int)seqPointer.StartingIndex + startIndex;
    if (absoluteStart >= seqPointer.EndingIndex)
    {
        return null;
    }

    // NOTE(review): presumably compensates for the line-break characters that precede
    // the sequence in the file - confirm against BioTextReader.ReadBlock.
    int newlineOffset = seqPointer.StartingLine * Environment.NewLine.Length;
    int sequenceLength = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

    using (BioTextReader bioReader = new BioTextReader(_fileName))
    {
        rangeSequence.InsertRange(
            0,
            bioReader.ReadBlock(startIndex, seqPointer.StartingIndex + newlineOffset, count, sequenceLength));
    }

    // default for partial load
    rangeSequence.IsReadOnly = true;
    return rangeSequence;
}
/// <summary>
/// Parses alignments in SAM format from a reader into a SequenceAlignmentMap object.
/// </summary>
/// <param name="bioReader">A reader for a biological sequence alignment text.</param>
/// <param name="isReadOnly">
/// Passed through to ParseSequences; when true the sequences in the resulting alignment
/// are meant to be read-only, otherwise editable.
/// </param>
/// <returns>A new SequenceAlignmentMap instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bioReader"/> is null.</exception>
/// <exception cref="FormatException">Thrown when the reader has no lines.</exception>
protected SequenceAlignmentMap ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    // Empty input is not allowed.
    if (!bioReader.HasLines)
    {
        throw new FormatException(Resource.Parser_NoTextErrorMessage);
    }

    // The header comes first; the map is created around it.
    SequenceAlignmentMap alignmentMap = new SequenceAlignmentMap(ParserSAMHeader(bioReader));

    // Then the aligned sequences are read into the map.
    ParseSequences(alignmentMap, bioReader, isReadOnly);
    return alignmentMap;
}
/// <summary>
/// Parses a list of GFF sequences using a BioTextReader.
/// </summary>
/// <remarks>
/// This method is overridden to process file-scope metadata that applies to all
/// of the sequences in the file.
/// </remarks>
/// <param name="bioReader">A reader for a GFF text.</param>
/// <param name="isReadOnly">
/// Passed to CopyMetadata; when true the resulting sequences are meant to be read-only,
/// otherwise editable.
/// </param>
/// <returns>The list of parsed ISequence objects.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bioReader"/> is null.</exception>
protected override IList<ISequence> Parse(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    _isSingleSeqGff = false;
    _sequences = new List<Sequence>();

    // DNA is used when no alphabet has been configured.
    IAlphabet alphabet = Alphabet ?? Alphabets.DNA;
    _commonSeq = Encoding == null
        ? new Sequence(alphabet)
        : new Sequence(alphabet, Encoding, string.Empty);

    // The GFF spec says that all headers need to be at the top of the file.
    ParseHeaders(bioReader);

    // Everything after the headers is feature data.
    while (bioReader.HasLines)
    {
        ParseFeatures(bioReader);
    }

    CopyMetadata(isReadOnly);

    return _sequences.Select(seq => seq as ISequence).ToList();
}
/// <summary>
/// Parses a single GenBank text from a reader into a sequence.
/// </summary>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.
/// </param>
/// <returns>A new Sequence instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bioReader"/> is null.</exception>
protected override ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    // DNA is used when no alphabet has been configured.
    IAlphabet alphabet = Alphabet ?? Alphabets.DNA;

    Sequence sequence;
    if (Encoding == null)
    {
        sequence = new Sequence(alphabet);
    }
    else
    {
        sequence = new Sequence(alphabet, Encoding, string.Empty);

        // Must stay editable while the parse steps below fill it in.
        sequence.IsReadOnly = false;
    }

    sequence.Metadata[Helper.GenBankMetadataKey] = new GenBankMetadata();
    sequence.MoleculeType = GetMoleculeType(sequence.Alphabet);

    // parse the file: headers, then features, then the sequence data
    ParseHeaders(bioReader, ref sequence);
    ParseFeatures(bioReader, ref sequence);
    ParseSequence(bioReader, ref sequence);

    // Apply the caller's read-only choice only once parsing is complete.
    sequence.IsReadOnly = isReadOnly;
    return sequence;
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// A read-only sequence holding the block read from the file, or null when the pointer's
/// starting index plus <paramref name="startIndex"/> is not below its ending index.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="startIndex"/> is negative or <paramref name="count"/> is not positive.
/// </exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (count <= 0)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Single() throws unless exactly one known alphabet carries the pointer's alphabet name.
    IAlphabet alphabet = Alphabets.All.Single(item => item.Name.Equals(seqPointer.AlphabetName));
    Sequence partial = new Sequence(alphabet) { IsReadOnly = false };

    int absoluteStart = (int)seqPointer.StartingIndex + startIndex;
    if (absoluteStart >= seqPointer.EndingIndex)
    {
        return null;
    }

    // NOTE(review): presumably compensates for the line-break characters that precede
    // the sequence in the file - confirm against BioTextReader.ReadBlock.
    int newlineChars = seqPointer.StartingLine * Environment.NewLine.Length;
    int sequenceLength = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

    using (BioTextReader bioReader = new BioTextReader(_fileName))
    {
        string block = bioReader.ReadBlock(
            startIndex,
            seqPointer.StartingIndex + newlineChars,
            count,
            sequenceLength);
        partial.InsertRange(0, block);
    }

    // default for partial load
    partial.IsReadOnly = true;
    return partial;
}
/// <summary>
/// Reads XML BLAST data from the reader and builds one BlastResult per XML document
/// found in the text.
/// </summary>
/// <remarks>
/// Lines starting with "RPS-BLAST" are ignored. A line starting with an XML declaration
/// ("&lt;?xml version") marks the start of a new document: the text buffered so far is
/// parsed into a result and a fresh buffer is started.
/// </remarks>
/// <param name="reader">The text source</param>
/// <returns>A list with one BlastResult per XML document.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="reader"/> is null.</exception>
/// <exception cref="FormatException">Thrown (after being traced) when no record was found.</exception>
public IList<BlastResult> Parse(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    List<BlastResult> records = new List<BlastResult>();
    StringBuilder sb = new StringBuilder();

    using (BioTextReader bioreader = new BioTextReader(reader))
    {
        // Blank lines are kept so the buffered XML matches the input text.
        bioreader.SkipBlankLines = false;

        while (bioreader.HasLines)
        {
            string line = bioreader.Line;

            if (line.StartsWith("RPS-BLAST", StringComparison.OrdinalIgnoreCase))
            {
                bioreader.GoToNextLine();
                continue;
            }

            // A new XML declaration closes the previous document. The buffer is only
            // flushed when it holds text: if the declaration follows nothing but skipped
            // "RPS-BLAST" lines there is no previous document to parse.
            if (line.StartsWith("<?xml version", StringComparison.OrdinalIgnoreCase)
                && bioreader.LineNumber > 1
                && sb.Length > 0)
            {
                records.Add(ParseXML(sb));
                sb = new StringBuilder();
            }

            sb.AppendLine(line);
            bioreader.GoToNextLine();
        }
    }

    // Flush the last (or only) document.
    if (sb.Length > 0)
    {
        records.Add(ParseXML(sb));
    }

    if (records.Count == 0)
    {
        string message = Properties.Resource.BlastNoRecords;
        Trace.Report(message);
        throw new FormatException(message);
    }

    return records;
}
/// <summary>
/// Parses the sequence data held in a characters block.
/// </summary>
/// <remarks>
/// DIMENSIONS supplies the expected sequence length ("nchar="), FORMAT is read through to
/// its terminating ';' without being interpreted, and MATRIX supplies the sequence rows.
/// Rows whose first token is not listed in <paramref name="IDs"/> are ignored; repeated
/// rows for the same ID are concatenated.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="IDs">List of sequence IDs</param>
/// <returns>The sequence text of each ID found in the matrix, keyed by ID.</returns>
/// <exception cref="FormatException">
/// Thrown when a collected sequence does not have the length given by "nchar=".
/// </exception>
private static Dictionary<string, string> ParseCharacterBlock(BioTextReader bioReader, IList<string> IDs)
{
    Dictionary<string, string> sequencesById = new Dictionary<string, string>();
    int declaredLength = 0;
    bool insideBlock = true;

    while (bioReader.HasLines && insideBlock)
    {
        bioReader.GoToNextLine();
        IList<string> tokens = GetTokens(bioReader.Line);

        if (string.Equals("DIMENSIONS", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            tokens[0] = string.Empty;

            // Parse dimensions: only the length of the sequences ("nchar=") is used.
            // The statement may span several lines and ends on a line that ends with ';'.
            while (true)
            {
                foreach (string rawToken in tokens)
                {
                    string setting = rawToken.Trim(';');
                    if (!string.IsNullOrEmpty(setting)
                        && setting.StartsWith("nchar=", StringComparison.OrdinalIgnoreCase))
                    {
                        declaredLength = Int32.Parse(setting.Substring(6), CultureInfo.InvariantCulture);
                    }
                }

                if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                bioReader.GoToNextLine();
                tokens = GetTokens(bioReader.Line);
                if (!bioReader.HasLines)
                {
                    break;
                }
            }
        }
        else if (string.Equals("FORMAT", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            tokens[0] = string.Empty;

            // The format statement (notation for "missing", "gap", "matchchar" and the
            // data type) is not interpreted; it is only read through to its closing ';'.
            while (true)
            {
                if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                bioReader.GoToNextLine();
                tokens = GetTokens(bioReader.Line);
                if (!bioReader.HasLines)
                {
                    break;
                }
            }
        }

        // Deliberately a separate test (not chained to the ones above): it looks at
        // whatever the tokens are after the DIMENSIONS / FORMAT handling.
        if (string.Equals("MATRIX", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            tokens[0] = string.Empty;

            // "If available" ignore the data in square brackets []
            while (bioReader.HasLines
                && bioReader.Line.StartsWith("[", StringComparison.OrdinalIgnoreCase))
            {
                bioReader.GoToNextLine();
            }

            // Here are the alignment sequences
            while (bioReader.HasLines)
            {
                bioReader.GoToNextLine();
                if (string.IsNullOrEmpty(bioReader.Line.Trim()))
                {
                    continue;
                }

                tokens = GetTokens(bioReader.Line);
                string rowId = tokens[0];

                // A row starting with ';' closes the matrix and the whole block.
                if (rowId.StartsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    insideBlock = false;
                    break;
                }

                if (IDs.Contains(rowId))
                {
                    string rowData = tokens[1];
                    string collectedSoFar;
                    if (sequencesById.TryGetValue(rowId, out collectedSoFar))
                    {
                        rowData = string.Concat(collectedSoFar, rowData);
                    }

                    sequencesById[rowId] = rowData;
                }
            }
        }
        else if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
        {
            insideBlock = false;
        }
    }

    // Read the end line "end;"
    bioReader.GoToNextLine();

    // Validate the length of sequence
    foreach (string collected in sequencesById.Values)
    {
        if (collected.Length != declaredLength)
        {
            throw new FormatException(Properties.Resource.SequenceLengthMismatch);
        }
    }

    return sequencesById;
}
/// <summary>
/// Gets the list of sequence titles from a taxa block.
/// </summary>
/// <remarks>
/// DIMENSIONS supplies the expected number of sequences ("ntax="), TAXLABELS supplies the
/// IDs, and END closes the block. Both statements may span several lines and end on a line
/// that ends with ';'.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <returns>List of sequence IDs</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the number of IDs read differs from the "ntax=" value.
/// </exception>
private static IList<string> ParseTaxaBlock(BioTextReader bioReader)
{
    IList<string> labels = new List<string>();
    int declaredCount = 0;
    bool insideBlock = true;

    while (bioReader.HasLines && insideBlock)
    {
        bioReader.GoToNextLine();
        IList<string> tokens = GetTokens(bioReader.Line);
        string keyword = tokens[0].ToUpper(CultureInfo.InvariantCulture);

        bool readingDimensions = keyword == "DIMENSIONS";
        bool readingLabels = keyword == "TAXLABELS" || keyword == "TAXLABELS;";

        if (readingDimensions || readingLabels)
        {
            // The keyword itself carries no data.
            tokens[0] = string.Empty;

            while (true)
            {
                foreach (string rawToken in tokens)
                {
                    string value = rawToken.Trim(';');
                    if (string.IsNullOrEmpty(value))
                    {
                        continue;
                    }

                    if (readingLabels)
                    {
                        // Parse taxlabels: every non-empty token is a sequence ID.
                        labels.Add(value);
                    }
                    else if (value.StartsWith("ntax=", StringComparison.OrdinalIgnoreCase))
                    {
                        // Parse dimensions: read the count of sequences.
                        declaredCount = Int32.Parse(value.Substring(5), CultureInfo.InvariantCulture);
                    }
                }

                if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                bioReader.GoToNextLine();
                tokens = GetTokens(bioReader.Line);
                if (!bioReader.HasLines)
                {
                    break;
                }
            }
        }
        else if (keyword == "END" || keyword == "END;")
        {
            // Have reached the end of taxa block
            insideBlock = false;
        }
    }

    // Read the end line "end;"
    bioReader.GoToNextLine();

    // Validate the count
    if (declaredCount != labels.Count)
    {
        throw new InvalidDataException(Properties.Resource.NtaxMismatch);
    }

    return labels;
}
/// <summary>
/// Parses a single Nexus text from a reader into a sequence alignment.
/// </summary>
/// <remarks>
/// After the header, each "begin" block is dispatched on its name: TAXA yields the sequence
/// IDs, CHARACTERS yields the sequence data for those IDs, END stops parsing and any other
/// block is skipped up to its "end;" line.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new sequence alignment containing the parsed sequences.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bioReader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// Thrown when no alphabet can be identified for a sequence, or when the sequences of the
/// alignment do not all resolve to the same alphabet.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    ParseHeader(bioReader);

    string message = string.Empty;
    ISequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
    IList<string> ids = null;
    bool isInBlock = true;

    if (bioReader.Line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
    {
        while (bioReader.HasLines && isInBlock)
        {
            if (string.IsNullOrEmpty(bioReader.Line.Trim()))
            {
                bioReader.GoToNextLine();
                continue;
            }

            // The block name is the second token of the "begin <name>;" line.
            string blockName = GetTokens(bioReader.Line)[1];

            switch (blockName.ToUpper(CultureInfo.InvariantCulture))
            {
                case "TAXA":
                case "TAXA;":
                    // This block contains the count of sequence & title of each sequence
                    ids = (IList<string>)ParseTaxaBlock(bioReader);
                    break;

                case "CHARACTERS":
                case "CHARACTERS;":
                    // Block contains sequences
                    // NOTE(review): ids is still null here if no TAXA block came first - confirm
                    // whether such input needs a dedicated error.
                    Dictionary<string, string> dataSet = ParseCharacterBlock(bioReader, ids);
                    IAlphabet alignmentAlphabet = null;

                    foreach (string ID in ids)
                    {
                        IAlphabet alphabet = Alphabet;
                        string data = dataSet[ID];

                        if (null == alphabet)
                        {
                            // No alphabet configured: detect it from the data and require
                            // every sequence in the alignment to agree on it.
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Resource.InvalidSymbolInString,
                                    data);
                                throw new InvalidDataException(message);
                            }

                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                // The resource text has no placeholders, so it is used as is
                                // rather than being run through string.Format.
                                message = Properties.Resource.SequenceAlphabetMismatch;
                                throw new InvalidDataException(message);
                            }
                        }

                        Sequence sequence;
                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, data);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, data);
                        }

                        sequence.IsReadOnly = isReadOnly;
                        sequence.ID = ID;
                        sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                    }

                    break;

                case "END":
                case "END;":
                    // Have reached the end of block
                    isInBlock = false;
                    break;

                default:
                    // skip this block
                    while (bioReader.HasLines)
                    {
                        bioReader.GoToNextLine();
                        if (0 == string.Compare(bioReader.Line, "end;", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                    }

                    break;
            }

            bioReader.GoToNextLine();
        }
    }

    return sequenceAlignment;
}
/// <summary>
/// Parses a single Phylip text from a reader into a sequence alignment.
/// 1. First line has count of taxa and length of each sequence
/// 2. Sequences
///     a. First ten characters are the ID
///     b. Sequence itself
/// </summary>
/// <remarks>
/// The first block carries the IDs and the start of every sequence; each later block
/// carries one continuation line per sequence, in the same order. Blocks are separated by
/// blank lines.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new Sequence Alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="bioReader"/> is null.</exception>
/// <exception cref="InvalidDataException">
/// Thrown when the first line does not hold exactly two tokens, when no alphabet can be
/// identified or the sequences disagree on it, or when the number or length of the
/// sequences does not match the first line (including input that ends in the middle of a
/// block, or data lines following a declared count that is not positive).
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    string message = string.Empty;

    // Parse first line
    IList<string> tokens = GetTokens(bioReader.Line);
    if (2 != tokens.Count)
    {
        message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    int sequenceCount = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
    int sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

    IList<Sequence> data = new List<Sequence>();
    IAlphabet alignmentAlphabet = null;
    bool isFirstBlock = true;

    bioReader.GoToNextLine();   // Skip blank lines until we get to the first block.

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    bioReader.SkipBlankLines = false;

    while (bioReader.HasLines)
    {
        if (string.IsNullOrEmpty(bioReader.Line.Trim()))
        {
            bioReader.GoToNextLine();
            continue;
        }

        // Data follows although no sequences were declared. Without this check the
        // loop below would not run, the reader would never advance and this method
        // would not terminate.
        if (sequenceCount <= 0)
        {
            throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
        }

        for (int index = 0; index < sequenceCount; index++)
        {
            // The input ended in the middle of a block.
            if (!bioReader.HasLines)
            {
                if (isFirstBlock)
                {
                    throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
                }

                throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
            }

            if (isFirstBlock)
            {
                string id;
                string sequenceString;

                tokens = GetTokens(bioReader.Line);
                if (1 == tokens.Count)
                {
                    // ID and data are not separated: the ID is the first ten characters.
                    id = tokens[0].Substring(0, 10);
                    sequenceString = tokens[0].Substring(10);
                }
                else
                {
                    id = tokens[0];
                    sequenceString = tokens[1];
                }

                IAlphabet alphabet = Alphabet;
                if (null == alphabet)
                {
                    // No alphabet configured: detect it from the data and require
                    // every sequence in the alignment to agree on it.
                    alphabet = _basicParser.IdentifyAlphabet(alphabet, sequenceString);

                    if (null == alphabet)
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Resource.InvalidSymbolInString,
                            sequenceString);
                        throw new InvalidDataException(message);
                    }

                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        message = Properties.Resource.SequenceAlphabetMismatch;
                        throw new InvalidDataException(message);
                    }
                }

                Sequence sequence;
                if (Encoding == null)
                {
                    sequence = new Sequence(alphabet, sequenceString);
                }
                else
                {
                    sequence = new Sequence(alphabet, Encoding, sequenceString);
                }

                sequence.ID = id;

                // Must stay editable: later blocks append to it.
                sequence.IsReadOnly = false;
                data.Add(sequence);
            }
            else
            {
                // Continuation line: append to the sequence at the same position.
                Sequence sequence = data[index];
                sequence.InsertRange(sequence.Count, bioReader.Line.Trim());
            }

            bioReader.GoToNextLine();
        }

        // Reset the first block flag
        isFirstBlock = false;
    }

    // Validate for the count of sequence
    if (sequenceCount != data.Count)
    {
        throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
    }

    SequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    foreach (Sequence dataSequence in data)
    {
        dataSequence.IsReadOnly = isReadOnly;

        // Validate for the length of sequence
        if (sequenceLength != dataSequence.Count)
        {
            throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
        }

        sequenceAlignment.AlignedSequences[0].Sequences.Add(dataSequence);
    }

    return sequenceAlignment;
}
/// <summary>
/// Reads one four-line FASTQ record from the reader and builds a QualitativeSequence from it.
/// The lines are checked in order: an '@' header, a non-empty sequence line, a '+' header
/// (whose ID, when present, must equal the '@' ID) and a non-empty quality line with the
/// same length as the sequence line.
/// </summary>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting QualitativeSequence.
/// </param>
/// <returns>A new QualitativeSequence instance containing parsed data.</returns>
private IQualitativeSequence ParseOneWithFastQFormat(BioTextReader bioReader, bool isReadOnly)
{
    SequencePointer pointer = new SequencePointer();

    // Line 1: the record header, which has to start with '@'.
    bool hasRecordHeader = bioReader.HasLines
        && bioReader.Line.StartsWith("@", StringComparison.Ordinal);
    if (!hasRecordHeader)
    {
        string invalidFile = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        Trace.Report(invalidFile);
        throw new FileFormatException(invalidFile);
    }

    string id = bioReader.GetLineField(2).Trim();
    _numberOfCharactersParsed += bioReader.Line.Length;
    pointer.StartingIndex = _numberOfCharactersParsed;
    pointer.StartingLine = bioReader.LineNumber;

    // Line 2: the sequence symbols.
    bioReader.GoToNextLine();
    bool hasSequenceLine = bioReader.HasLines && !string.IsNullOrEmpty(bioReader.Line);
    if (!hasSequenceLine)
    {
        throw BuildFastQRecordError(Resource.FastQ_InvalidSequenceLine, id);
    }

    string sequenceLine = bioReader.Line;
    _numberOfCharactersParsed += sequenceLine.Length;
    pointer.EndingIndex = _numberOfCharactersParsed;

    // Line 3: the quality score header, which has to start with '+'.
    bioReader.GoToNextLine();
    bool hasQualityHeader = bioReader.HasLines
        && bioReader.Line.StartsWith("+", StringComparison.Ordinal);
    if (!hasQualityHeader)
    {
        throw BuildFastQRecordError(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
    }

    _numberOfCharactersParsed += bioReader.Line.Length;

    // An ID on the '+' line is optional, but when given it must repeat the '@' ID.
    string qualScoreId = bioReader.GetLineField(2).Trim();
    bool idsAgree = string.IsNullOrEmpty(qualScoreId) || id.Equals(qualScoreId);
    if (!idsAgree)
    {
        throw BuildFastQRecordError(Resource.FastQ_InvalidQualityScoreHeaderData, id);
    }

    // Line 4: the quality scores.
    bioReader.GoToNextLine();
    bool hasQualityLine = bioReader.HasLines && !string.IsNullOrEmpty(bioReader.Line);
    if (!hasQualityLine)
    {
        throw BuildFastQRecordError(Resource.FastQ_EmptyQualityScoreLine, id);
    }

    string qualityLine = bioReader.Line;
    _numberOfCharactersParsed += qualityLine.Length;
    byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

    // One quality score is required per sequence symbol.
    if (sequenceLine.Length != qualityLine.Length)
    {
        throw BuildFastQRecordError(Resource.FastQ_InvalidQualityScoresLength, id);
    }

    bioReader.GoToNextLine();

    // Fall back to alphabet detection when no alphabet was configured.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = IdentifyAlphabet(alphabet, sequenceLine);
        if (alphabet == null)
        {
            throw BuildFastQRecordError(Resource.InvalidSymbolInString, sequenceLine);
        }
    }

    // Detect the FASTQ variant from the scores when auto-detection is switched on.
    FastQFormatType formatType = FastqType;
    if (AutoDetectFastQFormat)
    {
        formatType = IdentifyFastQFormatType(qualScores);
    }

    QualitativeSequence sequence = Encoding == null
        ? new QualitativeSequence(alphabet, formatType, sequenceLine, qualScores)
        : new QualitativeSequence(alphabet, formatType, Encoding, sequenceLine, qualScores);
    sequence.ID = id;
    sequence.IsReadOnly = isReadOnly;

    // Full load: the sequence is returned without a virtual data provider.
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return sequence;
    }

    // Otherwise record where the sequence lives and attach a provider built on that pointer.
    pointer.AlphabetName = sequence.Alphabet.Name;
    pointer.Id = sequence.ID;
    _sequencePointers.Add(pointer);

    FileVirtualQualitativeSequenceProvider provider = new FileVirtualQualitativeSequenceProvider(this, pointer);
    provider.BlockSize = _blockSize;
    provider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    sequence.VirtualQualitativeSequenceProvider = provider;

    return sequence;
}

/// <summary>
/// Formats a record-level FASTQ error, wraps it in the parser's IO format error message,
/// reports it through Trace and returns the exception for the caller to throw.
/// </summary>
/// <param name="detailFormat">Format string of the specific error.</param>
/// <param name="detailArgument">Value substituted into <paramref name="detailFormat"/>.</param>
/// <returns>The exception carrying the reported message.</returns>
private FileFormatException BuildFastQRecordError(string detailFormat, string detailArgument)
{
    string detail = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
    string report = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
    Trace.Report(report);
    return new FileFormatException(report);
}
/// <summary>
/// Reads one FASTA record: a header line starting with '>' followed by the sequence lines
/// up to the next header or the end of the text.
/// </summary>
/// <param name="bioReader">bio text reader</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the resulting sequence once parsing is done.
/// </param>
/// <returns>A new Sequence instance containing parsed data.</returns>
protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    // The record has to open with a header line.
    if (!bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        string invalidFile = string.Format(
            CultureInfo.InvariantCulture,
            Resource.INVAILD_INPUT_FILE,
            Resource.FASTA_NAME);
        Trace.Report(invalidFile);
        throw new FileFormatException(invalidFile);
    }

    // Process header line.
    string id = bioReader.GetLineField(2).Trim();

    // A pointer is only tracked when the block size is above the full-load default.
    SequencePointer pointer = null;
    if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
    {
        _lineCount++;
        _lineLength += bioReader.Line.Length;
        pointer = new SequencePointer();
        pointer.StartingLine = _lineCount;
    }

    bioReader.GoToNextLine();

    // Use the configured alphabet, or detect one from the first line after the header.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(null, bioReader.Line);
        if (alphabet == null)
        {
            string invalidSymbol = string.Format(
                CultureInfo.InvariantCulture,
                Resource.InvalidSymbolInString,
                bioReader.Line);
            Trace.Report(invalidSymbol);
            throw new FileFormatException(invalidSymbol);
        }
    }

    Sequence sequence = Encoding == null
        ? new Sequence(alphabet)
        : new Sequence(alphabet, Encoding, string.Empty) { IsReadOnly = false };
    sequence.ID = id;

    // Set once the start offset of this record's data has been captured.
    bool startCaptured = false;

    while (bioReader.HasLines && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        if (Alphabet == null)
        {
            // Re-detect per line, starting from the alphabet the sequence currently has.
            alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, bioReader.Line);
            if (alphabet == null)
            {
                string invalidSymbol = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.InvalidSymbolInString,
                    bioReader.Line);
                Trace.Report(invalidSymbol);
                throw new FileFormatException(invalidSymbol);
            }

            // Detection picked a different alphabet: rebuild the sequence under it.
            if (sequence.Alphabet != alphabet)
            {
                Sequence rebuilt = new Sequence(alphabet, Encoding, sequence) { IsReadOnly = false };
                sequence.Clear();
                sequence = rebuilt;
            }
        }

        if (_blockSize <= 0)
        {
            // full load
            sequence.InsertRange(sequence.Count, bioReader.Line);
        }
        else
        {
            // Only the position bookkeeping is updated; the symbols are not stored here.
            if (!startCaptured)
            {
                _sequenceBeginsAt = _lineLength;
                startCaptured = true;
            }

            _lineLength += bioReader.Line.Length;
            _lineCount++;
        }

        bioReader.GoToNextLine();
    }

    if (sequence.MoleculeType == MoleculeType.Invalid)
    {
        sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
    }

    sequence.IsReadOnly = isReadOnly;

    // full load
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return sequence;
    }

    if (pointer != null)
    {
        pointer.AlphabetName = sequence.Alphabet.Name;
        pointer.Id = sequence.ID;
        pointer.StartingIndex = _sequenceBeginsAt;
        pointer.EndingIndex = _lineLength;
        _sequencePointers.Add(pointer);
    }

    _sequenceCount++;

    FileVirtualSequenceProvider provider = new FileVirtualSequenceProvider(this, pointer);
    provider.BlockSize = _blockSize;
    provider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    sequence.VirtualSequenceProvider = provider;

    return sequence;
}
/// <summary>
/// Format-specific parsing hook for FASTQ: hands the reader to ParseOneWithFastQFormat
/// and returns the sequence it produces.
/// </summary>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Passed through unchanged; indicates whether the resulting QualitativeSequence
/// should be in readonly mode.
/// </param>
/// <returns>The sequence returned by ParseOneWithFastQFormat.</returns>
protected override ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    IQualitativeSequence parsed = ParseOneWithFastQFormat(bioReader, isReadOnly);
    return parsed;
}
/// <summary>
/// Returns the sequence whose display ID equals the given sequence name, creating and
/// registering it in _sequences when it does not exist yet.
/// When parsing a single-sequence GFF, the first registered sequence is the only one
/// allowed: if its display ID does not match the given name an exception is thrown.
/// </summary>
/// <param name="sequenceName">Name used as both ID and display ID of the sequence.</param>
/// <param name="moleculeType">
/// Molecule type of the sequence; MoleculeType.Invalid is treated as DNA.
/// </param>
/// <param name="bioReader">Reader whose LocationString is included in the error message.</param>
/// <returns>The existing or newly created sequence for the given name.</returns>
/// <exception cref="InvalidOperationException">
/// A second sequence name was encountered while parsing a single-sequence GFF.
/// </exception>
private Sequence GetSpecificSequence(string sequenceName, MoleculeType moleculeType, BioTextReader bioReader)
{
    // The GFF spec says that DNA is the default molecule type.
    if (moleculeType == MoleculeType.Invalid)
    {
        moleculeType = MoleculeType.DNA;
    }

    IAlphabet alphabet = GetAlphabet(moleculeType);

    // The very first name seen always gets a sequence of its own.
    if (_sequences.Count == 0)
    {
        CreateAndRegisterSequence(sequenceName, moleculeType, alphabet);
    }

    if (_isSingleSeqGff)
    {
        if (!_sequences[0].DisplayID.Equals(sequenceName))
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.UnexpectedSecondSequenceName,
                bioReader.LocationString);
            Trace.Report(message);
            throw new InvalidOperationException(message);
        }

        return _sequences[0];
    }

    Sequence seq = _sequences.FirstOrDefault(s => s.DisplayID.Equals(sequenceName));
    if (seq == null)
    {
        seq = CreateAndRegisterSequence(sequenceName, moleculeType, alphabet);
    }

    return seq;
}

/// <summary>
/// Creates an empty sequence with the given name, molecule type and alphabet
/// (using the parser's Encoding when one is set) and adds it to _sequences.
/// </summary>
/// <param name="sequenceName">Value assigned to both DisplayID and ID.</param>
/// <param name="moleculeType">Molecule type assigned to the sequence.</param>
/// <param name="alphabet">Alphabet the sequence is created with.</param>
/// <returns>The newly created and registered sequence.</returns>
private Sequence CreateAndRegisterSequence(string sequenceName, MoleculeType moleculeType, IAlphabet alphabet)
{
    Sequence created;
    if (Encoding == null)
    {
        created = new Sequence(alphabet);
    }
    else
    {
        created = new Sequence(alphabet, Encoding, string.Empty);
        created.IsReadOnly = false;
    }

    created.DisplayID = sequenceName;
    created.ID = sequenceName;
    created.MoleculeType = moleculeType;
    _sequences.Add(created);

    return created;
}