/// <summary>
/// Reads one four-line FASTQ record (header, sequence, quality header, quality scores)
/// from the reader and turns it into a QualitativeSequence.
/// </summary>
/// <param name="bioReader">Reader whose current line is expected to be the '@' header line.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the returned sequence.
/// </param>
/// <returns>The QualitativeSequence built from the record.</returns>
private IQualitativeSequence ParseOneWithFastQFormat(BioTextReader bioReader, bool isReadOnly)
{
    SequencePointer pointer = new SequencePointer();

    // Line 1: record header, which has to open with '@'.
    bool hasHeader = bioReader.HasLines && bioReader.Line.StartsWith("@", StringComparison.Ordinal);
    if (!hasHeader)
    {
        string invalidFile = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        Trace.Report(invalidFile);
        throw new FileFormatException(invalidFile);
    }

    string id = bioReader.GetLineField(2).Trim();
    _numberOfCharactersParsed += bioReader.Line.Length;
    pointer.StartingIndex = _numberOfCharactersParsed;
    pointer.StartingLine = bioReader.LineNumber;

    // Line 2: the sequence symbols; must be present and non-empty.
    bioReader.GoToNextLine();
    bool hasSequence = bioReader.HasLines && !string.IsNullOrEmpty(bioReader.Line);
    if (!hasSequence)
    {
        throw CreateFastQRecordException(Resource.FastQ_InvalidSequenceLine, id);
    }

    string sequenceLine = bioReader.Line;
    _numberOfCharactersParsed += sequenceLine.Length;
    pointer.EndingIndex = _numberOfCharactersParsed;

    // Line 3: quality header, which has to open with '+'.
    bioReader.GoToNextLine();
    bool hasQualityHeader = bioReader.HasLines && bioReader.Line.StartsWith("+", StringComparison.Ordinal);
    if (!hasQualityHeader)
    {
        throw CreateFastQRecordException(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
    }

    _numberOfCharactersParsed += bioReader.Line.Length;

    // When the quality header repeats an id, it has to be the one from line 1.
    string qualityHeaderId = bioReader.GetLineField(2).Trim();
    if (qualityHeaderId.Length != 0 && id != qualityHeaderId)
    {
        throw CreateFastQRecordException(Resource.FastQ_InvalidQualityScoreHeaderData, id);
    }

    // Line 4: the quality scores; must be present and non-empty.
    bioReader.GoToNextLine();
    bool hasQualityScores = bioReader.HasLines && !string.IsNullOrEmpty(bioReader.Line);
    if (!hasQualityScores)
    {
        throw CreateFastQRecordException(Resource.FastQ_EmptyQualityScoreLine, id);
    }

    string qualityLine = bioReader.Line;
    _numberOfCharactersParsed += qualityLine.Length;
    byte[] qualScores = ASCIIEncoding.ASCII.GetBytes(qualityLine);

    // One quality score per sequence symbol.
    if (qualityLine.Length != sequenceLine.Length)
    {
        throw CreateFastQRecordException(Resource.FastQ_InvalidQualityScoresLength, id);
    }

    // Leave the reader on the line that follows this record.
    bioReader.GoToNextLine();

    // Fall back to alphabet detection when no alphabet was configured on the parser.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = IdentifyAlphabet(alphabet, sequenceLine);
        if (alphabet == null)
        {
            throw CreateFastQRecordException(Resource.InvalidSymbolInString, sequenceLine);
        }
    }

    // Use the detected format type only when auto-detection is switched on.
    FastQFormatType fastQType = AutoDetectFastQFormat
        ? IdentifyFastQFormatType(qualScores)
        : FastqType;

    QualitativeSequence sequence = Encoding == null
        ? new QualitativeSequence(alphabet, fastQType, sequenceLine, qualScores)
        : new QualitativeSequence(alphabet, fastQType, Encoding, sequenceLine, qualScores);

    sequence.ID = id;
    sequence.IsReadOnly = isReadOnly;

    // Full load: the sequence already holds all of its data.
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return sequence;
    }

    // Otherwise remember where the record lives and attach a file-backed provider.
    pointer.AlphabetName = sequence.Alphabet.Name;
    pointer.Id = sequence.ID;
    _sequencePointers.Add(pointer);

    FileVirtualQualitativeSequenceProvider provider = new FileVirtualQualitativeSequenceProvider(this, pointer);
    provider.BlockSize = _blockSize;
    provider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    sequence.VirtualQualitativeSequenceProvider = provider;

    return sequence;
}

/// <summary>
/// Formats a record-level error, wraps it in the parser's IO error message,
/// reports it to the trace and returns the exception for the caller to throw.
/// </summary>
/// <param name="detailFormat">Format string describing the specific problem.</param>
/// <param name="detailArgument">Value substituted into <paramref name="detailFormat"/>.</param>
/// <returns>The FileFormatException carrying the wrapped message.</returns>
private FileFormatException CreateFastQRecordException(string detailFormat, string detailArgument)
{
    string detail = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
    string wrapped = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
    Trace.Report(wrapped);
    return new FileFormatException(wrapped);
}
// Processes headers, which are a type of comment.
//
// Consumes every leading line that starts with the comment mark. Lines that also
// start with the header mark are interpreted as directives (GFF-VERSION,
// SOURCE-VERSION, DATE, TYPE, DNA/RNA/PROTEIN, SEQUENCE-REGION); all other
// comment lines, unknown directives and header marks with no directive text
// are skipped. On return the reader is on the first line that is not a comment.
//
// Throws NotSupportedException when GFF-VERSION names a version other than "2",
// and FormatException when a directive is malformed (too few fields, unparsable
// date, invalid molecule type, or a non-header line inside an embedded sequence).
private void ParseHeaders(BioTextReader bioReader)
{
    while (bioReader.HasLines && bioReader.Line.StartsWith(_commentMark, StringComparison.Ordinal))
    {
        Sequence specificSeq = null;

        // process headers, but ignore other comments
        if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            string[] fields = bioReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // A header mark followed by nothing but spaces yields no fields; treat it
            // like any other comment instead of indexing into an empty array.
            string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;

            switch (directive)
            {
                case "GFF-VERSION":
                    if (fields.Length > 1 && fields[1] != "2")
                    {
                        string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                bioReader.LocationString);
                        Trace.Report(message);
                        throw new NotSupportedException(message);
                    }

                    // don't store this
                    break;

                case "SOURCE-VERSION":
                    // Needs both the source name and its version.
                    EnsureHeaderFieldCount(fields, 3, bioReader);
                    _commonSeq.Metadata["source"] = fields[1];
                    _commonSeq.Metadata["version"] = fields[2];
                    break;

                case "DATE":
                    DateTime date;

                    // A missing date is reported the same way as an unparsable one.
                    if (fields.Length < 2 || !DateTime.TryParse(fields[1], out date))
                    {
                        string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                bioReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    _commonSeq.Metadata["date"] = date;
                    break;

                case "TYPE":
                    // A TYPE directive without a type name is as invalid as an unknown type.
                    if (fields.Length < 2)
                    {
                        string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                bioReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    if (fields.Length == 2)
                    {
                        // Type only: applies to the common sequence.
                        _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                        if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }
                    }
                    else
                    {
                        // Type plus sequence name: applies to that specific sequence.
                        specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), bioReader);
                        if (specificSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }
                    }

                    break;

                case "DNA":
                case "RNA":
                case "PROTEIN":
                    // Needs the name of the sequence whose symbols follow.
                    EnsureHeaderFieldCount(fields, 2, bioReader);
                    specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), bioReader);

                    // Append every header line up to the matching "##end-<directive>" line.
                    // The loop also stops if the reader runs out of lines first.
                    bioReader.GoToNextLine();
                    while (bioReader.HasLines && bioReader.Line != "##end-" + fields[0])
                    {
                        if (!bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                        {
                            string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        specificSeq.InsertRange(specificSeq.Count, bioReader.GetLineField(3));
                        bioReader.GoToNextLine();
                    }

                    break;

                case "SEQUENCE-REGION":
                    // Needs the sequence name plus start and end.
                    EnsureHeaderFieldCount(fields, 4, bioReader);
                    specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, bioReader);
                    specificSeq.Metadata["start"] = fields[2];
                    specificSeq.Metadata["end"] = fields[3];
                    break;
            }
        }

        bioReader.GoToNextLine();
    }
}

// Throws a traced FormatException when a header directive carries fewer fields
// (directive name included) than its handler is about to read.
// NOTE(review): the message is a literal because no matching resource string is
// visible from here; move it into Properties.Resource if localization is required.
private void EnsureHeaderFieldCount(string[] fields, int required, BioTextReader bioReader)
{
    if (fields.Length >= required)
    {
        return;
    }

    string message = String.Format(
            CultureInfo.CurrentCulture,
            "Malformed header directive '{0}': expected at least {1} field(s). {2}",
            fields[0],
            required,
            bioReader.LocationString);
    Trace.Report(message);
    throw new FormatException(message);
}
/// <summary>
/// Parses a single FASTA text from a reader into a sequence.
/// </summary>
/// <param name="bioReader">Reader whose current line is expected to be the '>' header line.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.
/// </param>
/// <returns>A new Sequence instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
/// <exception cref="FileFormatException">
/// Thrown when the reader has no current line, the current line does not start with '>',
/// or no alphabet can be identified for a sequence line.
/// </exception>
protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    SequencePointer sequencePointer = null;

    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    string message;

    // Check HasLines before reading Line so that an exhausted reader is reported
    // as an invalid input file rather than failing on the Line access.
    if (!bioReader.HasLines || !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(CultureInfo.InvariantCulture,
                Resource.INVAILD_INPUT_FILE,
                Resource.FASTA_NAME);
        Trace.Report(message);
        throw new FileFormatException(message);
    }

    // Process header line.
    Sequence sequence;
    string id = bioReader.GetLineField(2).Trim();

    // Block-wise loading: account for the header line and start a pointer for this record.
    if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
    {
        _lineCount++;
        _lineLength += bioReader.Line.Length;
        sequencePointer = new SequencePointer { StartingLine = _lineCount };
    }

    bioReader.GoToNextLine();

    // Use the configured alphabet, or identify one from the first sequence line.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, bioReader.Line);

        if (alphabet == null)
        {
            message = string.Format(CultureInfo.InvariantCulture,
                    Resource.InvalidSymbolInString,
                    bioReader.Line);
            Trace.Report(message);
            throw new FileFormatException(message);
        }
    }

    if (Encoding == null)
    {
        sequence = new Sequence(alphabet);
    }
    else
    {
        // Kept writable while parsing; the requested read-only state is applied at the end.
        sequence = new Sequence(alphabet, Encoding, string.Empty) { IsReadOnly = false };
    }

    bool sameSequence = false;
    sequence.ID = id;

    // Consume sequence lines until the next record header or the end of input.
    while (bioReader.HasLines && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        if (Alphabet == null)
        {
            // No fixed alphabet: re-identify on every line and rebuild the sequence
            // when the identified alphabet differs from the one in use.
            alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, bioReader.Line);

            if (alphabet == null)
            {
                message = string.Format(CultureInfo.InvariantCulture,
                        Resource.InvalidSymbolInString,
                        bioReader.Line);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            if (sequence.Alphabet != alphabet)
            {
                Sequence seq = new Sequence(alphabet, Encoding, sequence) { IsReadOnly = false };
                sequence.Clear();
                sequence = seq;
            }
        }

        // full load
        if (_blockSize <= 0)
        {
            sequence.InsertRange(sequence.Count, bioReader.Line);
        }
        else
        {
            // Block-wise loading: record where the symbols begin and keep the
            // running character and line totals up to date; no symbols are stored.
            if (sameSequence == false)
            {
                _sequenceBeginsAt = _lineLength;
                sameSequence = true;
            }

            _lineLength += bioReader.Line.Length;
            _lineCount++;
        }

        bioReader.GoToNextLine();
    }

    if (sequence.MoleculeType == MoleculeType.Invalid)
    {
        sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
    }

    sequence.IsReadOnly = isReadOnly;

    // full load
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return sequence;
    }

    if (sequencePointer != null)
    {
        sequencePointer.AlphabetName = sequence.Alphabet.Name;
        sequencePointer.Id = sequence.ID;
        sequencePointer.StartingIndex = _sequenceBeginsAt;
        sequencePointer.EndingIndex = _lineLength;
        _sequencePointers.Add(sequencePointer);
    }

    _sequenceCount++;

    // Attach a file-backed provider so the symbols can be fetched from the file later.
    FileVirtualSequenceProvider dataprovider = new FileVirtualSequenceProvider(this, sequencePointer)
    {
        BlockSize = _blockSize,
        MaxNumberOfBlocks = _maxNumberOfBlocks
    };
    sequence.VirtualSequenceProvider = dataprovider;

    return sequence;
}
// LOCUS is the first line in a GenBank record.
//
// Parses the LOCUS line the reader is currently on, stores the result as the
// Locus of the sequence's GenBank metadata, and advances the reader to the next
// line. The sequence ID and molecule type are set from the line; when the
// alphabet implied by the molecule type differs from the sequence's alphabet,
// "sequence" is replaced by a new Sequence over that alphabet.
//
// Throws InvalidDataException for a LOCUS line with too few tokens, an unknown
// sequence type, strand type or topology, or an alphabet that conflicts with
// the configured one; InvalidOperationException when the sequence length is
// not an integer; FormatException for an unparsable date or molecule type.
private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankLocusInfo locusInfo = new GenBankLocusInfo();

    // GenBank spec recommends token rather than position-based parsing, but this
    // is only partially possible without making extra assumptions about the presence
    // of optional fields.
    string[] tokens = bioReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Name, length and sequence type are read by index below, so a shorter line
    // is reported as an invalid locus instead of failing on the array access.
    if (tokens.Length < 3)
    {
        string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    sequence.ID = tokens[0];
    locusInfo.Name = tokens[0];

    int sequenceLength;
    if (!int.TryParse(tokens[1], out sequenceLength))
    {
        // Same exception type as before, now traced and carrying the offending line.
        string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    locusInfo.SequenceLength = sequenceLength;

    string seqType = tokens[2];
    if (seqType != "bp" && seqType != "aa")
    {
        string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    // Determine format version and parse the remaining fields by position.
    string strandType;
    string strandTopology;
    string division;
    string rawDate;
    string molType = string.Empty;

    // The column holding "bp"/"aa" tells the two layouts apart.
    if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
    {
        // older format
        strandType = bioReader.GetLineField(34, 36).Trim();
        strandTopology = bioReader.GetLineField(43, 52).Trim();
        division = bioReader.GetLineField(53, 56).Trim();
        rawDate = bioReader.GetLineField(63).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(37, 42).Trim();
        }
    }
    else
    {
        // newer format
        strandType = bioReader.GetLineField(45, 47).Trim();
        strandTopology = bioReader.GetLineField(56, 63).Trim();
        division = bioReader.GetLineField(65, 67).Trim();
        rawDate = bioReader.GetLineField(69).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(48, 53).Trim();
        }
    }

    // process strand type
    if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
    {
        string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.Strand = Helper.GetStrandType(strandType);

    // process strand topology
    if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
    {
        string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidStrand,
                strandTopology);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

    // process division; anything Enum.Parse rejects (including an empty field) maps to None
    try
    {
        locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
    }
    catch (ArgumentException)
    {
        locusInfo.DivisionCode = SequenceDivisionCode.None;
    }

    // process date
    // The LOCUS date uses English month abbreviations (e.g. 21-JUN-1999), so parse
    // it culture-independently first; parsing with the current culture alone fails
    // or misreads the date under non-English cultures. The current culture remains
    // as a fallback so input that was accepted before is still accepted.
    DateTime date;
    if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date)
        && !DateTime.TryParse(rawDate, out date))
    {
        string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidDate,
                rawDate);
        Trace.Report(message);
        throw new FormatException(message);
    }

    locusInfo.Date = date;
    locusInfo.SequenceType = seqType;

    // process sequence type and molecule type
    MoleculeType moleculeType;
    if (seqType == "aa")
    {
        moleculeType = MoleculeType.Protein;
    }
    else
    {
        moleculeType = GetMoleculeType(molType);

        if (moleculeType == MoleculeType.Invalid)
        {
            string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
            Trace.Report(message);
            throw new FormatException(message);
        }
    }

    // Switch the sequence over to the alphabet implied by the molecule type,
    // unless that contradicts an alphabet configured on the parser.
    IAlphabet alphabet = GetAlphabet(moleculeType);
    if (alphabet != sequence.Alphabet)
    {
        if (Alphabet != null && Alphabet != alphabet)
        {
            string message = Properties.Resource.ParserIncorrectAlphabet;
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        sequence = new Sequence(alphabet, Encoding, sequence);
        sequence.IsReadOnly = false;
    }

    sequence.MoleculeType = moleculeType;
    locusInfo.MoleculeType = moleculeType;

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Locus = locusInfo;
    bioReader.GoToNextLine();
}