/// <summary>
/// Parses the Nexus header: the mandatory "#NEXUS" signature line and an optional
/// bracketed title comment that follows it.
/// </summary>
/// <remarks>
/// Before returning, blank-line skipping is switched off on the reader because the
/// block parsers that run next treat blank lines as block separators.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <exception cref="InvalidDataException">
/// Thrown when the current line does not start with "#NEXUS" (case-insensitive).
/// </exception>
private void ParseHeader(BioTextReader bioReader)
{
    if (!bioReader.Line.StartsWith("#NEXUS", StringComparison.OrdinalIgnoreCase))
    {
        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    // Move past the signature line.
    bioReader.GoToNextLine();

    // Title of Alignment, written as a "[ ... ]" comment.
    if (bioReader.HasLines && bioReader.Line.Trim().StartsWith("[", StringComparison.OrdinalIgnoreCase))
    {
        // The closing bracket may sit on the same line as the opening one, so the
        // current line is tested BEFORE advancing. Stopping when the reader runs dry
        // also keeps an unterminated comment from dereferencing a missing line.
        while (bioReader.HasLines && !bioReader.Line.Trim().EndsWith("]", StringComparison.OrdinalIgnoreCase))
        {
            bioReader.GoToNextLine();
        }
    }

    // Step over the line the reader currently rests on (the closing line of the title).
    // NOTE(review): this advance is unconditional, so when no title comment is present
    // the line following "#NEXUS" is skipped as well - confirm that is intended.
    bioReader.GoToNextLine();

    // Now that we're at the first block, one or more blank lines are the block
    // separators, which we'll need.
    bioReader.SkipBlankLines = false;
}
// Gathers the data of a header block that spans several lines into one string.
// The data of the current line comes first; the data of every following line that
// carries no header of its own is appended, preceded by lineBreakSubstitution.
// On return the reader rests on the first line that has a header, or is exhausted.
private static string ParseMultiLineData(BioTextReader bioReader, string lineBreakSubstitution)
{
    string combined = bioReader.LineData;

    for (bioReader.GoToNextLine();
         bioReader.HasLines && !bioReader.LineHasHeader;
         bioReader.GoToNextLine())
    {
        // continuation line: join it to what has been collected so far
        combined = combined + (lineBreakSubstitution + bioReader.LineData);
    }

    return combined;
}
/// <summary>
/// Reads every alignment available from a BioTextReader.
/// </summary>
/// <remarks>
/// Blank lines between alignments are skipped; each non-blank position is handed to
/// ParseOneWithSpecificFormat. Parsers that need to process file-scope metadata
/// applying to everything in the file should override this method.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Passed through to ParseOneWithSpecificFormat; indicates whether the sequences of
/// the resulting alignments should be in readonly mode.
/// </param>
/// <returns>The list of parsed ISequenceAlignment objects.</returns>
/// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
/// <exception cref="InvalidDataException">Thrown when the reader has no lines at all.</exception>
protected virtual IList<ISequenceAlignment> Parse(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    // no empty files allowed
    if (!bioReader.HasLines)
    {
        throw new InvalidDataException(Properties.Resource.IONoTextToParse);
    }

    var parsed = new List<ISequenceAlignment>();

    while (bioReader.HasLines)
    {
        bool lineIsBlank = bioReader.Line.Trim().Length == 0;

        if (lineIsBlank)
        {
            bioReader.GoToNextLine();
        }
        else
        {
            parsed.Add(ParseOneWithSpecificFormat(bioReader, isReadOnly));
        }
    }

    return parsed;
}
// Parses the alignment lines of a SAM file and appends them to seqAlignment.QuerySequences.
//
// Reading starts at the reader's current line and continues until the input is
// exhausted or a line starting with "@" is met. Each line is split on tabDelim:
// fields 0-8 fill the header properties of a SAMAlignedSequence, fields 9 and 10 are
// handed to ParseQualityNSequence, and every remaining field must be an optional
// field of the form TAG:VTYPE:VALUE.
//
// Throws FormatException when a line has fewer than 11 fields, or when the header or
// an optional field fails validation. Failures of int.Parse propagate unchanged.
private void ParseSequences(SequenceAlignmentMap seqAlignment, BioTextReader bioReader, bool isReadOnly)
{
    // Fields 0..10 are indexed unconditionally below.
    const int MandatoryFieldCount = 11;

    // An optional field has exactly three parts; the third keeps any further colons.
    const int OptionalFieldPartCount = 3;

    string optionalFieldRegExpn = OptionalFieldLinePattern;

    while (bioReader.HasLines && !bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        string[] tokens = bioReader.Line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);
        string message;

        // Reject short (or blank) lines up front instead of failing with an
        // IndexOutOfRangeException half-way through populating the record.
        if (tokens.Length < MandatoryFieldCount)
        {
            message = string.Format(
                CultureInfo.CurrentCulture,
                "Invalid SAM alignment line: expected at least {0} tab-delimited fields but found {1}.",
                MandatoryFieldCount,
                tokens.Length);
            throw new FormatException(message);
        }

        SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
        alignedSeq.QName = tokens[0];
        alignedSeq.Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
        alignedSeq.RName = tokens[2];
        alignedSeq.Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture);
        alignedSeq.MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture);
        alignedSeq.CIGAR = tokens[5];

        // "=" means the mate reference name is the same as the reference name.
        alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
        alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
        alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

        message = alignedSeq.IsValidHeader();
        if (!string.IsNullOrEmpty(message))
        {
            throw new FormatException(message);
        }

        // Look up the reference sequence by ID (case-insensitive), if any were supplied.
        ISequence refSeq = null;
        if (RefSequences != null && RefSequences.Count > 0)
        {
            refSeq = RefSequences.FirstOrDefault(
                r => string.Equals(r.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase));
        }

        ParseQualityNSequence(alignedSeq, Alphabet, Encoding, tokens[9], tokens[10], refSeq, isReadOnly);

        for (int i = MandatoryFieldCount; i < tokens.Length; i++)
        {
            if (!Helper.IsValidRegexValue(optionalFieldRegExpn, tokens[i]))
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.InvalidOptionalField, tokens[i]);
                throw new FormatException(message);
            }

            // Split into at most three parts so that a value which itself contains
            // the delimiter (e.g. "XX:Z:a:b") is kept whole rather than cut to "a".
            string[] opttokens = tokens[i].Split(
                colonDelim,
                OptionalFieldPartCount,
                StringSplitOptions.RemoveEmptyEntries);

            SAMOptionalField optField = new SAMOptionalField();
            optField.Tag = opttokens[0];
            optField.VType = opttokens[1];
            optField.Value = opttokens[2];

            message = optField.IsValid();
            if (!string.IsNullOrEmpty(message))
            {
                throw new FormatException(message);
            }

            alignedSeq.OptionalFields.Add(optField);
        }

        seqAlignment.QuerySequences.Add(alignedSeq);
        bioReader.GoToNextLine();
    }
}
/// <summary>
/// Reads XML BLAST data from the reader and builds one BlastResult per XML document
/// found in the text.
/// </summary>
/// <remarks>
/// The input may hold several XML documents back to back. Lines are accumulated in a
/// buffer; each line starting with "&lt;?xml version" closes the document collected so
/// far, which is then passed to ParseXML. Lines starting with "RPS-BLAST" are dropped.
/// </remarks>
/// <param name="reader">The text source</param>
/// <returns>A list of BlastResult objects, one per XML document.</returns>
/// <exception cref="ArgumentNullException">Thrown when reader is null.</exception>
/// <exception cref="FormatException">Thrown when no record could be read.</exception>
public IList<BlastResult> Parse(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    List<BlastResult> records = new List<BlastResult>();
    StringBuilder sb = new StringBuilder();

    using (BioTextReader bioreader = new BioTextReader(reader))
    {
        // Blank lines are part of the document text and must be preserved.
        bioreader.SkipBlankLines = false;

        while (bioreader.HasLines)
        {
            if (bioreader.Line.StartsWith("RPS-BLAST", StringComparison.OrdinalIgnoreCase))
            {
                bioreader.GoToNextLine();
                continue;
            }

            // A new XML declaration ends the previous document. Only flush when
            // something has actually been collected: testing the line number alone
            // would hand an empty buffer to ParseXML whenever the first declaration
            // is preceded solely by skipped "RPS-BLAST" lines.
            if (sb.Length > 0 &&
                bioreader.Line.StartsWith("<?xml version", StringComparison.OrdinalIgnoreCase))
            {
                records.Add(ParseXML(sb));
                sb = new StringBuilder();
            }

            sb.AppendLine(bioreader.Line);
            bioreader.GoToNextLine();
        }
    }

    // Flush the last (or only) document.
    if (sb.Length > 0)
    {
        records.Add(ParseXML(sb));
    }

    if (records.Count == 0)
    {
        string message = Properties.Resource.BlastNoRecords;
        Trace.Report(message);
        throw new FormatException(message);
    }

    return records;
}
/// <summary>
/// Parses a SequenceAlignmentMap using a BioTextReader.
/// </summary>
/// <remarks>
/// Leading blank lines are skipped; parsing proper is delegated to
/// ParseOneWithSpecificFormat at the first non-blank line.
/// </remarks>
/// <param name="bioReader">A reader for a sequence alignment text.</param>
/// <param name="isReadOnly">
/// Passed through to ParseOneWithSpecificFormat; indicates whether sequences in the
/// resulting sequence alignment should be in readonly mode.
/// </param>
/// <returns>
/// The parsed SequenceAlignmentMap, or null when the reader holds nothing but blank lines.
/// </returns>
private SequenceAlignmentMap Parse(BioTextReader bioReader, bool isReadOnly)
{
    // Step over any blank lines in front of the content.
    while (bioReader.HasLines && bioReader.Line.Trim().Length == 0)
    {
        bioReader.GoToNextLine();
    }

    if (!bioReader.HasLines)
    {
        return null;
    }

    return ParseOneWithSpecificFormat(bioReader, isReadOnly);
}
/// <summary>
/// Parses the SAM alignment header from the specified BioTextReader.
/// </summary>
/// <remarks>
/// Consumes the run of consecutive lines starting with "@" at the reader's current
/// position. Lines whose record type code is "CO" are stored as comments; every other
/// line becomes a SAMRecordField whose tags are read from the tab-delimited tokens
/// (two-character tag name, one separator character, then the value).
/// When the reader is not positioned on a header line, an empty header is returned
/// without validation.
/// </remarks>
/// <param name="bioReader">Bio text reader.</param>
/// <returns>The parsed header; empty when no header lines are present.</returns>
/// <exception cref="FormatException">
/// Thrown when the assembled header reports a validation message.
/// </exception>
private static SAMAlignmentHeader ParserSAMHeader(BioTextReader bioReader)
{
    SAMAlignmentHeader samHeader = new SAMAlignmentHeader();

    // No header section at all: nothing to parse and nothing to validate.
    if (!bioReader.HasLines || !bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        return samHeader;
    }

    while (bioReader.HasLines && bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
    {
        string[] tokens = bioReader.Line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

        // Drop the leading "@" to get the record type code.
        string recordTypecode = tokens[0].Substring(1);

        // Validate the header format.
        ValidateHeaderLineFormat(bioReader.Line);

        if (string.Equals(recordTypecode, "CO", StringComparison.OrdinalIgnoreCase))
        {
            // Comment line: keep everything after the 4-character "@CO" + separator prefix.
            samHeader.Comments.Add(bioReader.Line.Substring(4));
        }
        else
        {
            SAMRecordField headerLine = new SAMRecordField(recordTypecode);

            for (int i = 1; i < tokens.Length; i++)
            {
                string tagToken = tokens[i];

                // Characters 0-1 are the tag name, character 2 the separator,
                // the remainder is the tag value.
                headerLine.Tags.Add(new SAMRecordFieldTag(tagToken.Substring(0, 2), tagToken.Substring(3)));
            }

            samHeader.RecordFields.Add(headerLine);
        }

        bioReader.GoToNextLine();
    }

    string message = samHeader.IsValid();
    if (!string.IsNullOrEmpty(message))
    {
        throw new FormatException(message);
    }

    return samHeader;
}
// Parses the consecutive feature lines for one sequence.
//
// The non-comment lines contain features, which are each stored as MetadataListItems.
// The fields of each feature are referred to as sub-items. For GFF, these have
// unique keys, but for compatibility with our internal representation of features from
// GenBank format, each sub-item is a list of strings, rather than a simple string.
//
// Reading stops at the end of input or at the first feature line whose sequence name
// differs from the sequence being processed; that line is left for the next call.
//
// Throws InvalidDataException for a wrong field count or an invalid field value, and
// InvalidOperationException when no feature line was found at all.
private void ParseFeatures(BioTextReader bioReader)
{
    List<MetadataListItem<List<string>>> featureList = null;
    Sequence specificSeq = null;
    char[] fieldDelimiter = new char[] { '\t' };

    while (bioReader.HasLines)
    {
        if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            // ignore comments
            bioReader.GoToNextLine();
            continue;
        }

        // fields are tab-delimited
        string[] featureFields = bioReader.Line.Split(fieldDelimiter, StringSplitOptions.RemoveEmptyEntries);
        if (featureFields.Length < _minFieldsPerFeature || featureFields.Length > _maxFieldsPerFeature)
        {
            string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
            throw new InvalidDataException(message);
        }

        // The featureFields array should now contain the following fields:
        //      featureFields[0]: sequence name
        //      featureFields[1]: source
        //      featureFields[2]: feature name
        //      featureFields[3]: start
        //      featureFields[4]: end
        //      featureFields[5]: score
        //      featureFields[6]: strand
        //      featureFields[7]: frame
        //      featureFields[8]: attributes (optional)
        // NOTE(review): fields 0-7 are indexed below, which presumes
        // _minFieldsPerFeature is at least 8 - confirm against its declaration.

        // Process sequence name.
        if (specificSeq == null)
        {
            specificSeq = GetSpecificSequence(featureFields[0], MoleculeType.Invalid, bioReader);

            // Retrieve features list, or add empty features list to metadata if this
            // is the first feature.
            if (specificSeq.Metadata.ContainsKey("features"))
            {
                featureList = specificSeq.Metadata["features"] as List<MetadataListItem<List<string>>>;
            }
            else
            {
                featureList = new List<MetadataListItem<List<string>>>();
                specificSeq.Metadata["features"] = featureList;
            }
        }
        else if (specificSeq.DisplayID != featureFields[0])
        {
            // don't go to next line; current line still needs to be processed
            break;
        }

        // use feature name as key; attributes field is stored as free text
        string attributes = featureFields.Length == 9 ? featureFields[8] : string.Empty;
        MetadataListItem<List<string>> feature =
            new MetadataListItem<List<string>>(featureFields[2], attributes);

        // source
        feature.SubItems.Add("source", new List<string> { featureFields[1] });

        // Numeric fields are validated with the invariant culture: the file is
        // machine-readable data, so whether "0.95" is accepted must not depend on the
        // decimal separator of the machine's regional settings.
        int ignoredInt;

        // start is an int
        if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoredInt))
        {
            throw CreateInvalidGffFieldException("start", featureFields[3]);
        }

        feature.SubItems.Add("start", new List<string> { featureFields[3] });

        // end is an int
        if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoredInt))
        {
            throw CreateInvalidGffFieldException("end", featureFields[4]);
        }

        feature.SubItems.Add("end", new List<string> { featureFields[4] });

        // score is a double, or a dot as a space holder
        if (featureFields[5] != ".")
        {
            double ignoredDouble;
            if (!double.TryParse(
                    featureFields[5],
                    NumberStyles.Float | NumberStyles.AllowThousands,
                    CultureInfo.InvariantCulture,
                    out ignoredDouble))
            {
                throw CreateInvalidGffFieldException("score", featureFields[5]);
            }

            feature.SubItems.Add("score", new List<string> { featureFields[5] });
        }

        // strand is + or -, or a dot as a space holder
        if (featureFields[6] != ".")
        {
            if (featureFields[6] != "+" && featureFields[6] != "-")
            {
                throw CreateInvalidGffFieldException("strand", featureFields[6]);
            }

            feature.SubItems.Add("strand", new List<string> { featureFields[6] });
        }

        // frame is an int, or a dot as a space holder
        if (featureFields[7] != ".")
        {
            if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoredInt))
            {
                throw CreateInvalidGffFieldException("frame", featureFields[7]);
            }

            feature.SubItems.Add("frame", new List<string> { featureFields[7] });
        }

        // done with that one
        featureList.Add(feature);
        bioReader.GoToNextLine();
    }

    // A feature file with no features? May it never be.
    if (featureList == null)
    {
        string message = Properties.Resource.GFFNoFeatures;
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }
}

// Builds (and reports to Trace) the exception raised for an invalid GFF feature field.
private static InvalidDataException CreateInvalidGffFieldException(string fieldName, string fieldValue)
{
    string message = String.Format(
        CultureInfo.CurrentCulture,
        Properties.Resource.GffInvalidField,
        fieldName,
        fieldValue);
    Trace.Report(message);
    return new InvalidDataException(message);
}
/// <summary>
/// Parses a single Phylip text from a reader into a sequence alignment.
/// 1. First line has count of taxa and length of each sequence
/// 2. Sequences
///     a. First ten characters are the ID
///     b. Sequence itself
/// </summary>
/// <remarks>
/// The data is read block by block, blocks being separated by blank lines. The first
/// block supplies the ID and the first part of every sequence; in every later block
/// the n-th line is appended to the n-th sequence.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new Sequence Alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
/// <exception cref="InvalidDataException">
/// Thrown when the first line does not hold exactly two tokens, a sequence line of the
/// first block is malformed, the alphabet cannot be identified or differs between
/// sequences, or the number or length of sequences does not match the first line.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    // Width of the ID column when ID and sequence are not separated by white space.
    const int IdColumnWidth = 10;

    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    string message = string.Empty;

    // Parse first line
    IList<string> tokens = GetTokens(bioReader.Line);
    if (2 != tokens.Count)
    {
        message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    bool isFirstBlock = true;
    IList<Sequence> data = new List<Sequence>();
    string id = string.Empty;
    string sequenceString = string.Empty;
    Sequence sequence = null;
    IAlphabet alignmentAlphabet = null;

    int sequenceCount = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
    int sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

    bioReader.GoToNextLine();  // Skip blank lines until we get to the first block.

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    bioReader.SkipBlankLines = false;

    while (bioReader.HasLines)
    {
        if (string.IsNullOrEmpty(bioReader.Line.Trim()))
        {
            bioReader.GoToNextLine();
            continue;
        }

        // Stop as soon as the input runs out: a truncated block is then reported by
        // the count/length validation below instead of failing on a missing line.
        for (int index = 0; index < sequenceCount && bioReader.HasLines; index++)
        {
            if (isFirstBlock)
            {
                tokens = GetTokens(bioReader.Line);

                if (0 == tokens.Count ||
                    (1 == tokens.Count && tokens[0].Length < IdColumnWidth))
                {
                    // Neither "ID sequence" nor a fixed-width ID column can be read.
                    message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                    throw new InvalidDataException(message);
                }

                if (1 == tokens.Count)
                {
                    // ID and sequence run together: the ID takes the fixed-width column.
                    id = tokens[0].Substring(0, IdColumnWidth);
                    sequenceString = tokens[0].Substring(IdColumnWidth);
                }
                else
                {
                    id = tokens[0];
                    sequenceString = tokens[1];
                }

                IAlphabet alphabet = Alphabet;
                if (null == alphabet)
                {
                    alphabet = _basicParser.IdentifyAlphabet(alphabet, sequenceString);

                    if (null == alphabet)
                    {
                        message = string.Format(
                                CultureInfo.InvariantCulture,
                                Resource.InvalidSymbolInString,
                                sequenceString);
                        throw new InvalidDataException(message);
                    }

                    // All sequences of one alignment must share the identified alphabet.
                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        message = Properties.Resource.SequenceAlphabetMismatch;
                        throw new InvalidDataException(message);
                    }
                }

                if (Encoding == null)
                {
                    sequence = new Sequence(alphabet, sequenceString);
                }
                else
                {
                    sequence = new Sequence(alphabet, Encoding, sequenceString);
                }

                sequence.ID = id;

                // Kept writable while later blocks are appended; the requested
                // read-only state is applied once parsing has finished.
                sequence.IsReadOnly = false;
                data.Add(sequence);
            }
            else
            {
                sequence = data[index];
                sequence.InsertRange(sequence.Count, bioReader.Line.Trim());
            }

            bioReader.GoToNextLine();
        }

        // Reset the first block flag
        isFirstBlock = false;
    }

    // Validate for the count of sequence
    if (sequenceCount != data.Count)
    {
        throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
    }

    SequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

    foreach (Sequence dataSequence in data)
    {
        dataSequence.IsReadOnly = isReadOnly;

        // Validate for the length of sequence
        if (sequenceLength != dataSequence.Count)
        {
            throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
        }

        sequenceAlignment.AlignedSequences[0].Sequences.Add(dataSequence);
    }

    return sequenceAlignment;
}
/// <summary>
/// Gets the list of sequence titles from a Nexus taxa block.
/// </summary>
/// <remarks>
/// The reader is expected to rest on the line that opens the block; parsing starts with
/// the line after it. The "dimensions" statement supplies the expected number of taxa
/// ("ntax=") and the "taxlabels" statement the IDs; either statement may span several
/// lines and ends with ';'. Parsing stops at "end" / "end;", and the reader is moved
/// past that line.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <returns>List of sequence IDs</returns>
/// <exception cref="InvalidDataException">
/// Thrown when the number of labels differs from the declared "ntax" value.
/// </exception>
private static IList<string> ParseTaxaBlock(BioTextReader bioReader)
{
    bool isInTaxaBlock = true;
    string data = string.Empty;
    int sequenceCount = 0;
    IList<string> IDs = new List<string>();
    char[] statementTerminator = new char[] { ';' };

    while (bioReader.HasLines && isInTaxaBlock)
    {
        bioReader.GoToNextLine();
        if (!bioReader.HasLines)
        {
            // Input ended before "end;" - leave the rest to the count validation.
            break;
        }

        IList<string> tokens = GetTokens(bioReader.Line);
        if (tokens.Count == 0)
        {
            // Blank line inside the block: nothing to interpret.
            continue;
        }

        switch (tokens[0].ToUpper(CultureInfo.InvariantCulture))
        {
            case "DIMENSIONS":
                // Blank out the keyword so only the arguments are examined.
                tokens[0] = string.Empty;

                // Parse dimensions
                // 1. Read count of sequence
                while (true)
                {
                    foreach (string token in tokens)
                    {
                        data = token.Trim(statementTerminator);

                        if (string.IsNullOrEmpty(data))
                        {
                            continue;
                        }

                        if (data.StartsWith("ntax=", StringComparison.OrdinalIgnoreCase))
                        {
                            sequenceCount = Int32.Parse(data.Substring(5), CultureInfo.InvariantCulture);
                        }
                    }

                    if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                    {
                        break;
                    }

                    // Statement continues on the next line, if there is one.
                    bioReader.GoToNextLine();
                    if (!bioReader.HasLines)
                    {
                        break;
                    }

                    tokens = GetTokens(bioReader.Line);
                }

                break;

            case "TAXLABELS":
            case "TAXLABELS;":
                // Blank out the keyword so it is not taken for a label.
                tokens[0] = string.Empty;

                // Parse taxlabels
                // 1. Read IDs of sequence
                while (true)
                {
                    foreach (string token in tokens)
                    {
                        data = token.Trim(statementTerminator);

                        if (string.IsNullOrEmpty(data))
                        {
                            continue;
                        }

                        IDs.Add(data);
                    }

                    if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                    {
                        break;
                    }

                    // Labels continue on the next line, if there is one.
                    bioReader.GoToNextLine();
                    if (!bioReader.HasLines)
                    {
                        break;
                    }

                    tokens = GetTokens(bioReader.Line);
                }

                break;

            case "END":
            case "END;":
                // Have reached the end of taxa block
                isInTaxaBlock = false;
                break;

            default:
                break;
        }
    }

    // Read the end line "end;"
    if (bioReader.HasLines)
    {
        bioReader.GoToNextLine();
    }

    // Validate the count
    if (sequenceCount != IDs.Count)
    {
        throw new InvalidDataException(Properties.Resource.NtaxMismatch);
    }

    return IDs;
}
// Parses everything before the features section of a GenBank record.
//
// Header lines are dispatched on bioReader.LineHeader and stored in the
// GenBankMetadata kept under Helper.GenBankMetadataKey in sequence.Metadata. The loop
// ends at the first FEATURES, BASE COUNT, ORIGIN or CONTIG header, which is left as
// the current line for the caller.
//
// Throws InvalidDataException for a second LOCUS line, a PRIMARY line without its
// column headers, an unknown header, or when no LOCUS line was seen.
private void ParseHeaders(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    string data = string.Empty;
    string[] tokens = null;

    // set data indent for headers
    bioReader.DataIndent = _dataIndent;

    // only allow one locus line
    bool haveParsedLocus = false;

    // parse until we hit the features or sequence section
    bool haveFinishedHeaders = false;

    while (bioReader.HasLines && !haveFinishedHeaders)
    {
        switch (bioReader.LineHeader)
        {
            case "LOCUS":
                if (haveParsedLocus)
                {
                    string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserSecondLocus,
                            bioReader.LocationString);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                ParseLocus(bioReader, ref sequence);

                // sequence is passed by ref, so fetch the metadata again afterwards.
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                haveParsedLocus = true;
                // don't go to next line; current line still needs to be processed
                break;

            case "VERSION":
                tokens = bioReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
                metadata.Version = new GenBankVersion();

                // first token contains accession and version
                if (tokens.Length > 0)
                {
                    Match versionMatch = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    if (versionMatch.Success)
                    {
                        metadata.Version.Version = versionMatch.Groups["version"].Value;

                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = versionMatch.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else if (!versionLineAccession.Equals(metadata.Accession.Primary))
                        {
                            ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                        }
                        else
                        {
                            metadata.Version.Accession = metadata.Accession.Primary;
                        }
                    }
                }

                // second token contains primary ID; it is optional, so a VERSION line
                // holding only "accession.version" must not fail on a missing token
                if (tokens.Length > 1)
                {
                    Match giMatch = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (giMatch.Success)
                    {
                        metadata.Version.GINumber = giMatch.Groups["primaryID"].Value;
                    }
                }

                bioReader.GoToNextLine();
                break;

            case "PROJECT":
                // expected form: name:number[,number...]
                tokens = bioReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.Project = new ProjectIdentifier();
                    metadata.Project.Name = tokens[0];
                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.Project.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + bioReader.Line);
                }

                bioReader.GoToNextLine();
                break;

            case "SOURCE":
                ParseSource(bioReader, ref sequence);
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                // don't go to next line; current line still needs to be processed
                break;

            case "REFERENCE":
                ParseReferences(bioReader, ref sequence);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                // don't go to next line; current line still needs to be processed
                break;

            case "COMMENT":
                ParseComments(bioReader, ref sequence);   // can encounter more than one
                metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                // don't go to next line; current line still needs to be processed
                break;

            case "PRIMARY":
                // This header is followed by sequence info in a table format that could be
                // stored in a custom object.  The first line contains column headers.
                // For now, just validate the presence of the headers, and save the data
                // as a string.
                int[] locs = new int[4];
                locs[0] = bioReader.LineData.IndexOf("TPA_SPAN", StringComparison.Ordinal);
                locs[1] = bioReader.LineData.IndexOf("PRIMARY_IDENTIFIER", StringComparison.Ordinal);
                locs[2] = bioReader.LineData.IndexOf("PRIMARY_SPAN", StringComparison.Ordinal);
                locs[3] = bioReader.LineData.IndexOf("COMP", StringComparison.Ordinal);
                if (locs[0] < 0 || locs[1] < 0 || locs[2] < 0 || locs[3] < 0)
                {
                    string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserPrimaryLineError,
                            bioReader.Line);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                string primaryData = ParseMultiLineData(bioReader, Environment.NewLine);
                metadata.Primary = primaryData;
                // don't go to next line; current line still needs to be processed
                break;

            // all the following are extracted the same way - possibly multiline
            case "DEFINITION":
                metadata.Definition = ParseMultiLineData(bioReader, " ");
                break;

            case "ACCESSION":
                data = ParseMultiLineData(bioReader, " ");
                metadata.Accession = new GenBankAccession();
                string[] accessions = data.Split(' ');

                // first entry is the primary accession, the rest are secondary
                metadata.Accession.Primary = accessions[0];
                for (int i = 1; i < accessions.Length; i++)
                {
                    metadata.Accession.Secondary.Add(accessions[i]);
                }

                break;

            case "DBLINK":
                // expected form: type:number[,number...]
                tokens = bioReader.LineData.Split(':');
                if (tokens.Length == 2)
                {
                    metadata.DBLink = new CrossReferenceLink();

                    // anything other than "Project" is treated as a Trace Assembly Archive link
                    if (string.Compare(tokens[0],
                        CrossReferenceType.Project.ToString(),
                        StringComparison.OrdinalIgnoreCase) == 0)
                    {
                        metadata.DBLink.Type = CrossReferenceType.Project;
                    }
                    else
                    {
                        metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                    }

                    tokens = tokens[1].Split(',');
                    for (int i = 0; i < tokens.Length; i++)
                    {
                        metadata.DBLink.Numbers.Add(tokens[i]);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + bioReader.Line);
                }

                bioReader.GoToNextLine();
                break;

            case "DBSOURCE":
                metadata.DBSource = ParseMultiLineData(bioReader, " ");
                break;

            case "KEYWORDS":
                metadata.Keywords = ParseMultiLineData(bioReader, " ");
                break;

            case "SEGMENT":
                data = ParseMultiLineData(bioReader, " ");

                // Expected form: "<current> of <count>". Split on the word "of" itself;
                // splitting on its individual characters would also accept a lone 'o'
                // or 'f' as the separator.
                tokens = data.Split(new string[] { "of" }, StringSplitOptions.RemoveEmptyEntries);
                int outvalue;
                if (tokens.Length == 2)
                {
                    metadata.Segment = new SequenceSegment();
                    if (int.TryParse(tokens[0].Trim(), out outvalue))
                    {
                        metadata.Segment.Current = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                    }

                    if (int.TryParse(tokens[1].Trim(), out outvalue))
                    {
                        metadata.Segment.Count = outvalue;
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                    }
                }
                else
                {
                    ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + bioReader.Line);
                }

                break;

            // all the following indicate sections beyond the headers parsed by this method
            case "FEATURES":
            case "BASE COUNT":
            case "ORIGIN":
            case "CONTIG":
                haveFinishedHeaders = true;
                break;

            default:
                ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", bioReader.LineHeader, bioReader.LineData);
                string errMessage = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParseHeaderError,
                            bioReader.LineHeader);
                Trace.Report(errMessage);
                throw new InvalidDataException(errMessage);
        }
    }

    // check for required features
    if (!haveParsedLocus)
    {
        string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }
}
// Processes headers, which are a type of comment.
//
// Consumes the run of lines starting with _commentMark at the reader's current
// position. Lines that also start with _headerMark are interpreted as directives
// (gff-version, source-version, date, type, DNA/RNA/Protein, sequence-region); other
// comment lines are skipped.
//
// Throws NotSupportedException for a gff-version other than "2", and FormatException
// for an invalid or missing date or type, or for a non-header line inside an embedded
// sequence section.
private void ParseHeaders(BioTextReader bioReader)
{
    while (bioReader.HasLines && bioReader.Line.StartsWith(_commentMark, StringComparison.Ordinal))
    {
        Sequence specificSeq = null;

        // process headers, but ignore other comments
        if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
        {
            string[] fields = bioReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

            // A header mark with nothing after it carries no directive; treat it like
            // any other comment instead of indexing into an empty array.
            string directive = fields.Length > 0 ? fields[0].ToUpperInvariant() : string.Empty;

            switch (directive)
            {
                case "GFF-VERSION":
                    if (fields.Length > 1 && fields[1] != "2")
                    {
                        string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                bioReader.LocationString);
                        Trace.Report(message);
                        throw new NotSupportedException(message);
                    }

                    // don't store this
                    break;

                case "SOURCE-VERSION":
                    _commonSeq.Metadata["source"] = fields[1];
                    _commonSeq.Metadata["version"] = fields[2];
                    break;

                case "DATE":
                    // The date belongs to the file, not to the user's locale: parse it
                    // with the invariant culture so the result is the same everywhere.
                    // A directive without a value is reported as an invalid date.
                    DateTime date;
                    if (fields.Length < 2 ||
                        !DateTime.TryParse(fields[1], CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
                    {
                        string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                bioReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    _commonSeq.Metadata["date"] = date;
                    break;

                case "TYPE":
                    if (fields.Length < 2)
                    {
                        // No type given at all.
                        string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                bioReader.LocationString);
                        Trace.Report(message);
                        throw new FormatException(message);
                    }

                    if (fields.Length == 2)
                    {
                        // "type <molecule>": applies to the common sequence.
                        _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                        if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }
                    }
                    else
                    {
                        // "type <molecule> <sequence name>": applies to one named sequence.
                        specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), bioReader);
                        if (specificSeq.MoleculeType == MoleculeType.Invalid)
                        {
                            string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }
                    }

                    break;

                case "DNA":
                case "RNA":
                case "PROTEIN":
                    // Embedded sequence: header lines up to the matching "##end-<type>"
                    // line each contribute a piece of sequence data.
                    specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), bioReader);
                    bioReader.GoToNextLine();

                    while (bioReader.HasLines && bioReader.Line != "##end-" + fields[0])
                    {
                        if (!bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                        {
                            string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        specificSeq.InsertRange(specificSeq.Count, bioReader.GetLineField(3));

                        bioReader.GoToNextLine();
                    }

                    // The end marker itself is stepped over by the advance at the
                    // bottom of the outer loop.
                    break;

                case "SEQUENCE-REGION":
                    specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, bioReader);
                    specificSeq.Metadata["start"] = fields[2];
                    specificSeq.Metadata["end"] = fields[3];
                    break;
            }
        }

        bioReader.GoToNextLine();
    }
}
/// <summary>
/// Parses a single Nexus text from a reader into a sequence alignment.
/// </summary>
/// <remarks>
/// After the header has been consumed by ParseHeader, the blocks are processed in file
/// order: a "taxa" block supplies the sequence IDs, a "characters" block supplies the
/// sequence data for those IDs, and any other block is skipped up to its "end;" line.
/// When the line following the header does not start with "begin", an alignment with
/// one empty AlignedSequence is returned.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new sequence alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
/// <exception cref="InvalidDataException">
/// Thrown when the alphabet of a sequence cannot be identified or differs between sequences.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    ParseHeader(bioReader);

    string message = string.Empty;
    ISequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
    IList<string> ids = null;
    bool isInBlock = true;

    if (bioReader.Line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
    {
        while (bioReader.HasLines && isInBlock)
        {
            if (string.IsNullOrEmpty(bioReader.Line.Trim()))
            {
                bioReader.GoToNextLine();
                continue;
            }

            // The second token of a "begin <name>;" line is the block name.
            string blockName = GetTokens(bioReader.Line)[1];

            switch (blockName.ToUpper(CultureInfo.InvariantCulture))
            {
                case "TAXA":
                case "TAXA;":
                    // This block contains the count of sequence & title of each sequence
                    ids = ParseTaxaBlock(bioReader);
                    break;

                case "CHARACTERS":
                case "CHARACTERS;":
                    // Block contains sequences
                    // NOTE(review): ids is still null when no taxa block came first -
                    // confirm how ParseCharacterBlock is meant to cope with that.
                    Dictionary<string, string> dataSet = ParseCharacterBlock(bioReader, ids);

                    IAlphabet alignmentAlphabet = null;
                    string data = string.Empty;

                    foreach (string ID in ids)
                    {
                        IAlphabet alphabet = Alphabet;
                        Sequence sequence = null;
                        data = dataSet[ID];

                        if (null == alphabet)
                        {
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                        CultureInfo.InvariantCulture,
                                        Resource.InvalidSymbolInString,
                                        data);
                                throw new InvalidDataException(message);
                            }

                            // All sequences of one alignment must share the identified alphabet.
                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                // The resource takes no arguments, so it is used as is
                                // rather than being run through string.Format.
                                message = Properties.Resource.SequenceAlphabetMismatch;
                                throw new InvalidDataException(message);
                            }
                        }

                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, data);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, data);
                        }

                        sequence.IsReadOnly = isReadOnly;
                        sequence.ID = ID;
                        sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                    }

                    break;

                case "END":
                case "END;":
                    // Have reached the end of block
                    isInBlock = false;
                    break;

                default:
                    // skip this block
                    while (bioReader.HasLines)
                    {
                        bioReader.GoToNextLine();

                        // Compare the trimmed line so that an indented or padded "end;"
                        // still closes the block; otherwise the rest of the file would
                        // be swallowed as part of it.
                        if (bioReader.HasLines &&
                            0 == string.Compare(bioReader.Line.Trim(), "end;", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                    }

                    break;
            }

            bioReader.GoToNextLine();
        }
    }

    return sequenceAlignment;
}
// Handle optional BASE COUNT, then ORIGIN and sequence data.
//
// Lines are processed until a line starting with "//" (end of the sequence record,
// which is consumed) or the end of input. BASE COUNT, ORIGIN and CONTIG are stored in
// the GenBankMetadata of the sequence; the lines after ORIGIN that start with a space
// are read as sequence data and appended to the sequence.
//
// sequence is passed by ref because it is replaced by a new Sequence when no Alphabet
// is configured and the alphabet identified from the data differs from the current one.
//
// Throws InvalidDataException for an unexpected line or for sequence data whose
// alphabet cannot be identified.
private void ParseSequence(BioTextReader bioReader, ref Sequence sequence)
{
    string message = string.Empty;

    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

    // set data indent for sequence headers
    bioReader.DataIndent = _dataIndent;

    while (bioReader.HasLines)
    {
        if (bioReader.Line.StartsWith("//", StringComparison.Ordinal))
        {
            bioReader.GoToNextLine();
            break; // end of sequence record
        }

        switch (bioReader.LineHeader)
        {
            case "BASE COUNT":
                // The BASE COUNT linetype is obsolete and was removed
                // from the GenBank flatfile format in October 2003.  But if it is
                // present, we will use it.  We get the untrimmed version since it
                // starts with a right justified column.
                metadata.BaseCount = bioReader.Line.Substring(_dataIndent);
                bioReader.GoToNextLine();
                break;

            case "ORIGIN":
                // The origin line can contain optional data; don't put empty string into
                // metadata.
                if (!String.IsNullOrEmpty(bioReader.LineData))
                {
                    metadata.Origin = bioReader.LineData;
                }

                bioReader.GoToNextLine();
                IAlphabet alphabet = null;

                // Sequence lines start with a space. StartsWith is used instead of
                // indexing the first character so that an empty line ends the sequence
                // data rather than raising an IndexOutOfRangeException.
                while (bioReader.HasLines && bioReader.Line.StartsWith(" ", StringComparison.Ordinal))
                {
                    // Using a regex is too slow.
                    // Data starts at column 10 and comes in groups of up to 10
                    // characters, each group followed by one separator character.
                    int len = bioReader.Line.Length;
                    int k = 10;
                    while (k < len)
                    {
                        string seqData = bioReader.Line.Substring(k, Math.Min(10, len - k));
                        if (Alphabet == null)
                        {
                            alphabet = IdentifyAlphabet(alphabet, seqData);

                            if (alphabet == null)
                            {
                                message = String.Format(CultureInfo.CurrentCulture, Resource.InvalidSymbolInString, bioReader.Line);
                                Trace.Report(message);
                                throw new InvalidDataException(message);
                            }

                            // The identified alphabet changed: rebuild the sequence on
                            // the new alphabet and continue with the replacement.
                            if (sequence.Alphabet != alphabet)
                            {
                                Sequence seq = new Sequence(alphabet, Encoding, sequence);
                                seq.MoleculeType = sequence.MoleculeType;
                                seq.IsReadOnly = false;
                                sequence.Clear();
                                sequence = seq;
                            }
                        }

                        sequence.InsertRange(sequence.Count, seqData);
                        k += 11;
                    }

                    bioReader.GoToNextLine();
                }

                break;

            case "CONTIG":
                metadata.Contig = ParseMultiLineData(bioReader, Environment.NewLine);
                // don't go to next line; current line still needs to be processed
                break;

            default:
                message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserUnexpectedLineInSequence,
                        bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
        }
    }
}
// Parses the FEATURES table of a GenBank record. Each feature consists of a key
// and a (possibly multi-line) location, followed by zero or more qualifiers of
// the form /key="value" or /key. Parsed features are collected in a
// SequenceFeatures instance, which is stored on the sequence's GenBank metadata
// only when at least one feature was read.
//
// bioReader: reader positioned at (or before) the FEATURES header line.
// sequence:  sequence whose GenBank metadata receives the features.
//
// Throws InvalidOperationException when no LocationBuilder is configured, and
// InvalidDataException for an indented line without a feature key or a malformed
// qualifier line.
private void ParseFeatures(BioTextReader bioReader, ref Sequence sequence)
{
    ILocationBuilder locBuilder = LocationBuilder;
    if (locBuilder == null)
    {
        throw new InvalidOperationException(Resource.NullLocationBuild);
    }

    // set data indent for features
    bioReader.DataIndent = _featureDataIndent;

    // The sub-items of a feature are referred to as qualifiers. These do not have unique
    // keys, so they are stored as lists in the SubItems dictionary.
    SequenceFeatures features = new SequenceFeatures();
    IList<FeatureItem> featureList = features.All;

    while (bioReader.HasLines)
    {
        // Skip empty lines and the FEATURES header line itself.
        if (String.IsNullOrEmpty(bioReader.Line) || bioReader.LineHeader == "FEATURES")
        {
            bioReader.GoToNextLine();
            continue;
        }

        if (bioReader.Line[0] != ' ')
        {
            // start of non-feature text
            break;
        }

        // An indented line at this point must carry a feature key in its header.
        if (!bioReader.LineHasHeader)
        {
            string message = Properties.Resource.GenbankEmptyFeature;
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        // check for multi-line location string
        string featureKey = bioReader.LineHeader;
        string location = bioReader.LineData;
        bioReader.GoToNextLine();

        // Header-less data lines that do not start with '/' continue the location.
        while (bioReader.HasLines && !bioReader.LineHasHeader && bioReader.LineHasData && !bioReader.LineData.StartsWith("/", StringComparison.Ordinal))
        {
            location += bioReader.LineData;
            bioReader.GoToNextLine();
        }

        // create features as MetadataListItems
        FeatureItem feature = new FeatureItem(featureKey, locBuilder.GetLocation(location));

        // process the list of qualifiers, which are each in the form of
        // /key="value"
        string qualifierKey = string.Empty;
        string qualifierValue = string.Empty;
        while (bioReader.HasLines)
        {
            if (!bioReader.LineHasHeader && bioReader.LineHasData)
            {
                // A leading '/' starts a NEW qualifier; any other data line is a
                // continuation of the current qualifier's value.
                if (bioReader.LineData.StartsWith("/", StringComparison.Ordinal))
                {
                    // new qualifier; save previous if this isn't the first
                    if (!String.IsNullOrEmpty(qualifierKey))
                    {
                        AddQualifierToFeature(feature, qualifierKey, qualifierValue);
                    }

                    // set the key and value of this qualifier
                    int equalsIndex = bioReader.LineData.IndexOf('=');
                    if (equalsIndex < 0)
                    {
                        // no value, just key (this is allowed, see NC_005213.gbk)
                        qualifierKey = bioReader.LineData.Substring(1);
                        qualifierValue = string.Empty;
                    }
                    else if (equalsIndex > 0)
                    {
                        // key is the text between '/' and '='; value is everything after '='
                        qualifierKey = bioReader.LineData.Substring(1, equalsIndex - 1);
                        qualifierValue = bioReader.LineData.Substring(equalsIndex + 1);
                    }
                    else
                    {
                        // NOTE(review): equalsIndex == 0 cannot occur here because the
                        // line data is known to start with '/', so this branch looks
                        // unreachable. A line such as "/=value" takes the branch above
                        // and yields an empty key, which is never added to the feature.
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.GenbankInvalidFeature,
                            bioReader.Line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                }
                else
                {
                    // Continuation of previous line; "note" gets a line break, and
                    // everything else except "translation" and "transl_except" gets a
                    // space to separate words.
                    if (qualifierKey == "note")
                    {
                        qualifierValue += Environment.NewLine;
                    }
                    else if (qualifierKey != "translation" && qualifierKey != "transl_except")
                    {
                        qualifierValue += " ";
                    }

                    qualifierValue += bioReader.LineData;
                }

                bioReader.GoToNextLine();
            }
            else if (bioReader.Line.StartsWith("\t", StringComparison.Ordinal))
            {
                // this seems to be data corruption; but BioPerl test set includes
                // (old, 2003) NT_021877.gbk which has this problem, so we
                // handle it
                ApplicationLog.WriteLine("WARN: nonstandard line format at line {0}: '{1}'", bioReader.LineNumber, bioReader.Line);
                qualifierValue += " " + bioReader.Line.Trim();
                bioReader.GoToNextLine();
            }
            else
            {
                // Next feature key, or the end of the feature table.
                break;
            }
        }

        // add last qualifier
        if (!String.IsNullOrEmpty(qualifierKey))
        {
            AddQualifierToFeature(feature, qualifierKey, qualifierValue);
        }

        // still add feature, even if it has no qualifiers
        featureList.Add(StandardFeatureMap.GetStandardFeatureItem(feature));
    }

    // Only attach the feature table when something was parsed.
    if (featureList.Count > 0)
    {
        ((GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey]).Features = features;
    }
}
// Parses the SOURCE section of a GenBank record, including its indented
// ORGANISM sub-section, and stores the result as a new SequenceSource on the
// sequence's GenBank metadata.
//
// The SOURCE text becomes the common name. The first space in the ORGANISM text
// splits it into genus and species. ORGANISM continuation lines that end in ';'
// or '.' are collected as class levels; other continuation lines extend the
// organism text.
//
// Throws InvalidDataException when an indented line has a header other than ORGANISM.
// On return the reader is left on the first line that belongs to neither section.
private static void ParseSource(BioTextReader bioReader, ref Sequence sequence)
{
    string commonName = string.Empty;
    string organismText = string.Empty;
    string taxonomy = string.Empty;

    while (bioReader.HasLines)
    {
        if (bioReader.LineHeader == "SOURCE")
        {
            // May span several lines. The spec says the last line must end with a
            // period, which only matters in the multi-line case.
            bool endsWithPeriod = true;
            commonName = bioReader.LineData;
            bioReader.GoToNextLine();

            while (bioReader.HasLines && !bioReader.LineHasHeader)
            {
                commonName = string.Concat(commonName, " ", bioReader.LineData);
                endsWithPeriod = commonName.EndsWith(".", StringComparison.Ordinal);
                bioReader.GoToNextLine();
            }

            if (!endsWithPeriod && Trace.Want(Trace.SeqWarnings))
            {
                Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, commonName);
            }

            // the current line is unprocessed; loop again without advancing
            continue;
        }

        if (bioReader.Line[0] != ' ')
        {
            // not part of SOURCE; leave the current line for the caller
            break;
        }

        if (bioReader.LineHeader != "ORGANISM")
        {
            string error = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidSourceField,
                bioReader.LineHeader);
            Trace.Report(error);
            throw new InvalidDataException(error);
        }

        // ORGANISM can be multi-line as well
        organismText = bioReader.LineData;
        bioReader.GoToNextLine();

        while (bioReader.HasLines && !bioReader.LineHasHeader)
        {
            string rawLine = bioReader.Line;
            bool isClassLevelLine =
                rawLine.EndsWith(";", StringComparison.Ordinal) ||
                rawLine.EndsWith(".", StringComparison.Ordinal);

            if (isClassLevelLine)
            {
                if (!String.IsNullOrEmpty(taxonomy))
                {
                    taxonomy += " ";
                }

                taxonomy += bioReader.LineData;
            }
            else
            {
                organismText += " " + bioReader.LineData;
            }

            bioReader.GoToNextLine();
        }

        // the current line is unprocessed; loop again without advancing
    }

    GenBankMetadata genBankMetadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    genBankMetadata.Source = new SequenceSource();
    genBankMetadata.Source.CommonName = commonName;

    if (!string.IsNullOrEmpty(organismText))
    {
        int separator = organismText.IndexOf(' ');
        if (separator > 0)
        {
            // "Genus species ..." - everything after the first space is the species
            genBankMetadata.Source.Organism.Genus = organismText.Substring(0, separator);
            genBankMetadata.Source.Organism.Species = organismText.Substring(separator + 1);
        }
        else
        {
            genBankMetadata.Source.Organism.Genus = organismText;
        }
    }

    genBankMetadata.Source.Organism.ClassLevels = taxonomy;
}
// Parses the REFERENCE blocks of a GenBank record and appends one
// CitationReference per block to the References list of the sequence's GenBank
// metadata.
//
// A REFERENCE line carries the reference number and an optional parenthesised
// location, e.g. "1  (bases 1 to 118)". The indented lines that follow
// (AUTHORS, CONSRTM, TITLE, JOURNAL, REMARK, MEDLINE, PUBMED) fill in the
// current reference; each may span several lines.
//
// On return the reader is left on the first line that is not part of a reference.
//
// Throws InvalidDataException for a malformed REFERENCE line, an unknown
// sub-field, or a sub-field that appears before any REFERENCE line; throws
// InvalidOperationException when the reference number does not fit in an int.
private static void ParseReferences(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    IList<CitationReference> referenceList = metadata.References;
    CitationReference reference = null;

    while (bioReader.HasLines)
    {
        if (bioReader.LineHeader == "REFERENCE")
        {
            // a new block starts: store the one collected so far
            if (reference != null)
            {
                referenceList.Add(reference);
            }

            // check for start/end e.g. (bases 1 to 118), or prose notes
            Match m = Regex.Match(bioReader.LineData, @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
            if (!m.Success)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserReferenceError,
                    bioReader.LineData);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // create new reference
            string number = m.Groups["number"].Value;
            string location = m.Groups["location"].Value;
            reference = new CitationReference();

            int referenceNumber;
            if (!int.TryParse(number, out referenceNumber))
            {
                // The regex guarantees digits, so this is an out-of-range number.
                // Same exception type as before, now with a message.
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserReferenceError,
                    bioReader.LineData);
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }

            reference.Number = referenceNumber;
            reference.Location = location;
            bioReader.GoToNextLine();
        }
        else if (bioReader.Line.StartsWith(" ", StringComparison.Ordinal))
        {
            // A sub-field needs a REFERENCE line before it; report bad input
            // instead of failing with a NullReferenceException.
            if (reference == null)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidReferenceField,
                    bioReader.LineHeader);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            switch (bioReader.LineHeader)
            {
                // all the following are extracted the same way - possibly multiline.
                // ParseMultiLineData advances the reader past the field's lines.
                case "AUTHORS":
                    reference.Authors = ParseMultiLineData(bioReader, " ");
                    break;
                case "CONSRTM":
                    reference.Consortiums = ParseMultiLineData(bioReader, " ");
                    break;
                case "TITLE":
                    reference.Title = ParseMultiLineData(bioReader, " ");
                    break;
                case "JOURNAL":
                    reference.Journal = ParseMultiLineData(bioReader, " ");
                    break;
                case "REMARK":
                    reference.Remarks = ParseMultiLineData(bioReader, " ");
                    break;
                case "MEDLINE":
                    reference.Medline = ParseMultiLineData(bioReader, " ");
                    break;
                case "PUBMED":
                    reference.PubMed = ParseMultiLineData(bioReader, " ");
                    break;
                default:
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidReferenceField,
                        bioReader.LineHeader);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
            }
        }
        else
        {
            // don't go to next line; current line still needs to be processed
            break;
        }
    }

    // Add the last reference. Doing this after the loop also covers input that
    // ends inside a reference block, where the final reference used to be lost.
    if (reference != null)
    {
        referenceList.Add(reference);
    }
}
// Parses the LOCUS line, which is the first line in a GenBank record.
//
// The locus name, sequence length and sequence type ("bp" or "aa") are read as
// whitespace-separated tokens. Strand type, molecule type, topology, division
// and date are read by position, using one of two column layouts (older or
// newer format). The result is stored as GenBankLocusInfo on the sequence's
// GenBank metadata, and the reader is advanced to the next line.
//
// sequence may be replaced by a new Sequence when the alphabet implied by the
// molecule type differs from the sequence's current alphabet.
//
// Throws InvalidDataException for a malformed LOCUS line, strand type, topology
// or alphabet conflict; FormatException for an unparsable date or molecule
// type; InvalidOperationException when the sequence length is not an integer.
private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
{
    GenBankLocusInfo locusInfo = new GenBankLocusInfo();

    // GenBank spec recommends token rather than position-based parsing, but this
    // is only partially possible without making extra assumptions about the presence
    // of optional fields.
    string[] tokens = bioReader.LineData.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    // Name, length and sequence type are mandatory. Reject short lines here
    // instead of failing with IndexOutOfRangeException below.
    if (tokens.Length < 3)
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    sequence.ID = tokens[0];
    locusInfo.Name = tokens[0];

    int sequenceLength;
    if (!int.TryParse(tokens[1], out sequenceLength))
    {
        // Same exception type as before, now with a message.
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    locusInfo.SequenceLength = sequenceLength;

    string seqType = tokens[2];
    if (seqType != "bp" && seqType != "aa")
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    // Determine format version and parse the remaining fields by position.
    string strandType;
    string strandTopology;
    string division;
    string rawDate;
    string molType = string.Empty;
    if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
    {
        // older format
        strandType = bioReader.GetLineField(34, 36).Trim();
        strandTopology = bioReader.GetLineField(43, 52).Trim();
        division = bioReader.GetLineField(53, 56).Trim();
        rawDate = bioReader.GetLineField(63).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(37, 42).Trim();
        }
    }
    else
    {
        // newer format
        strandType = bioReader.GetLineField(45, 47).Trim();
        strandTopology = bioReader.GetLineField(56, 63).Trim();
        division = bioReader.GetLineField(65, 67).Trim();
        rawDate = bioReader.GetLineField(69).Trim();

        // molecule type field is not used for amino acid chains
        if (seqType != "aa")
        {
            molType = bioReader.GetLineField(48, 53).Trim();
        }
    }

    // process strand type
    if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidLocus,
            bioReader.Line);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.Strand = Helper.GetStrandType(strandType);

    // process strand topology
    if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidStrand,
            strandTopology);
        Trace.Report(message);
        throw new InvalidDataException(message);
    }

    locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

    // process division; an unrecognised or empty code maps to None
    try
    {
        locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
    }
    catch (ArgumentException)
    {
        locusInfo.DivisionCode = SequenceDivisionCode.None;
    }

    // process date. GenBank dates use English month abbreviations (e.g.
    // 21-JUN-1999), so parse with the invariant culture first; a current-culture
    // parse alone fails on non-English systems. The current culture is kept as a
    // fallback for input that only parsed that way before.
    DateTime date;
    if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date) &&
        !DateTime.TryParse(rawDate, out date))
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.ParserInvalidDate,
            rawDate);
        Trace.Report(message);
        throw new FormatException(message);
    }

    locusInfo.Date = date;
    locusInfo.SequenceType = seqType;

    // process sequence type and molecule type
    MoleculeType moleculeType;
    if (seqType == "aa")
    {
        moleculeType = MoleculeType.Protein;
    }
    else
    {
        moleculeType = GetMoleculeType(molType);
        if (moleculeType == MoleculeType.Invalid)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.ParserInvalidLocus,
                bioReader.Line);
            Trace.Report(message);
            throw new FormatException(message);
        }
    }

    IAlphabet alphabet = GetAlphabet(moleculeType);
    if (alphabet != sequence.Alphabet)
    {
        // An alphabet fixed on the parser must agree with the one implied by the
        // LOCUS line.
        if (Alphabet != null && Alphabet != alphabet)
        {
            string message = Properties.Resource.ParserIncorrectAlphabet;
            Trace.Report(message);
            throw new InvalidDataException(message);
        }

        sequence = new Sequence(alphabet, Encoding, sequence);
        sequence.IsReadOnly = false;
    }

    sequence.MoleculeType = moleculeType;
    locusInfo.MoleculeType = moleculeType;

    // Metadata is looked up after the possible replacement above, so the locus
    // info lands on the sequence instance that is handed back to the caller.
    GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
    metadata.Locus = locusInfo;
    bioReader.GoToNextLine();
}
/// <summary>
/// Reads one FASTA record (a header line starting with '>' followed by its
/// sequence lines) from the reader and returns it as a sequence.
/// </summary>
/// <remarks>
/// When no alphabet is set on the parser, the alphabet is identified from the
/// sequence lines. When a positive block size is configured, the sequence lines
/// are only counted, not loaded, and the returned sequence is backed by a
/// FileVirtualSequenceProvider.
/// </remarks>
/// <param name="bioReader">Reader positioned on the record's header line.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the returned sequence.
/// </param>
/// <returns>A new Sequence instance for the parsed record.</returns>
protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    SequencePointer pointer = null;

    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    // A record has to open with the '>' header marker.
    if (!bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        string headerError = string.Format(
            CultureInfo.InvariantCulture,
            Resource.INVAILD_INPUT_FILE,
            Resource.FASTA_NAME);
        Trace.Report(headerError);
        throw new FileFormatException(headerError);
    }

    // Header line: the id is the text from the second column onwards.
    string sequenceId = bioReader.GetLineField(2).Trim();

    if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
    {
        // Account for the header line and remember where this record starts.
        _lineCount++;
        _lineLength += bioReader.Line.Length;
        pointer = new SequencePointer();
        pointer.StartingLine = _lineCount;
    }

    bioReader.GoToNextLine();

    // Use the configured alphabet, or identify one from the first data line.
    IAlphabet initialAlphabet = Alphabet;
    if (initialAlphabet == null)
    {
        initialAlphabet = _commonSequenceParser.IdentifyAlphabet(initialAlphabet, bioReader.Line);
        if (initialAlphabet == null)
        {
            string symbolError = string.Format(
                CultureInfo.InvariantCulture,
                Resource.InvalidSymbolInString,
                bioReader.Line);
            Trace.Report(symbolError);
            throw new FileFormatException(symbolError);
        }
    }

    Sequence result = Encoding == null
        ? new Sequence(initialAlphabet)
        : new Sequence(initialAlphabet, Encoding, string.Empty) { IsReadOnly = false };

    bool startRecorded = false;
    result.ID = sequenceId;

    while (bioReader.HasLines && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        if (Alphabet == null)
        {
            // Re-check the alphabet against every line of sequence data.
            IAlphabet lineAlphabet = _commonSequenceParser.IdentifyAlphabet(result.Alphabet, bioReader.Line);
            if (lineAlphabet == null)
            {
                string symbolError = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.InvalidSymbolInString,
                    bioReader.Line);
                Trace.Report(symbolError);
                throw new FileFormatException(symbolError);
            }

            if (result.Alphabet != lineAlphabet)
            {
                // Rebuild the sequence under the newly identified alphabet.
                Sequence rebuilt = new Sequence(lineAlphabet, Encoding, result);
                rebuilt.IsReadOnly = false;
                result.Clear();
                result = rebuilt;
            }
        }

        if (_blockSize > 0)
        {
            // Block-wise loading: only track where the data lives.
            if (!startRecorded)
            {
                _sequenceBeginsAt = _lineLength;
                startRecorded = true;
            }

            _lineLength += bioReader.Line.Length;
            _lineCount++;
        }
        else
        {
            // full load
            result.InsertRange(result.Count, bioReader.Line);
        }

        bioReader.GoToNextLine();
    }

    if (result.MoleculeType == MoleculeType.Invalid)
    {
        result.MoleculeType = CommonSequenceParser.GetMoleculeType(result.Alphabet);
    }

    result.IsReadOnly = isReadOnly;

    // full load
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return result;
    }

    if (pointer != null)
    {
        pointer.AlphabetName = result.Alphabet.Name;
        pointer.Id = result.ID;
        pointer.StartingIndex = _sequenceBeginsAt;
        pointer.EndingIndex = _lineLength;
        _sequencePointers.Add(pointer);
    }

    _sequenceCount++;

    FileVirtualSequenceProvider provider = new FileVirtualSequenceProvider(this, pointer);
    provider.BlockSize = _blockSize;
    provider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    result.VirtualSequenceProvider = provider;

    return result;
}
/// <summary>
/// Reads one four-line FASTQ record from the reader: the '@' header line, the
/// sequence line, the '+' quality header line and the quality score line.
/// </summary>
/// <remarks>
/// The id on the '+' line, when present, must equal the id on the '@' line, and
/// the quality score line must be as long as the sequence line. Unless the
/// parser is configured for a full load, the returned sequence is backed by a
/// FileVirtualQualitativeSequenceProvider.
/// </remarks>
/// <param name="bioReader">Reader positioned on the record's '@' header line.</param>
/// <param name="isReadOnly">
/// Value assigned to the IsReadOnly property of the returned sequence.
/// </param>
/// <returns>A new QualitativeSequence instance for the parsed record.</returns>
private IQualitativeSequence ParseOneWithFastQFormat(BioTextReader bioReader, bool isReadOnly)
{
    SequencePointer pointer = new SequencePointer();

    // Line 1: header, must start with '@'.
    if (!bioReader.HasLines || !bioReader.Line.StartsWith("@", StringComparison.Ordinal))
    {
        string headerError = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        Trace.Report(headerError);
        throw new FileFormatException(headerError);
    }

    string id = bioReader.GetLineField(2).Trim();
    _numberOfCharactersParsed += bioReader.Line.Length;
    pointer.StartingIndex = _numberOfCharactersParsed;
    pointer.StartingLine = bioReader.LineNumber;

    // Line 2: the sequence itself, must not be empty.
    bioReader.GoToNextLine();
    if (!bioReader.HasLines || string.IsNullOrEmpty(bioReader.Line))
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidSequenceLine, id);
    }

    string sequenceLine = bioReader.Line;
    _numberOfCharactersParsed += bioReader.Line.Length;
    pointer.EndingIndex = _numberOfCharactersParsed;

    // Line 3: quality header, must start with '+'.
    bioReader.GoToNextLine();
    if (!bioReader.HasLines || !bioReader.Line.StartsWith("+", StringComparison.Ordinal))
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
    }

    _numberOfCharactersParsed += bioReader.Line.Length;

    // The id after '+' is optional, but when given it has to repeat the record id.
    string qualityHeaderId = bioReader.GetLineField(2).Trim();
    bool hasQualityHeaderId = !string.IsNullOrEmpty(qualityHeaderId);
    if (hasQualityHeaderId && !id.Equals(qualityHeaderId))
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoreHeaderData, id);
    }

    // Line 4: quality scores, must not be empty.
    bioReader.GoToNextLine();
    if (!bioReader.HasLines || string.IsNullOrEmpty(bioReader.Line))
    {
        throw CreateFastQFormatException(Resource.FastQ_EmptyQualityScoreLine, id);
    }

    _numberOfCharactersParsed += bioReader.Line.Length;
    byte[] qualityScores = ASCIIEncoding.ASCII.GetBytes(bioReader.Line);

    // One quality score per sequence symbol.
    if (sequenceLine.Length != bioReader.Line.Length)
    {
        throw CreateFastQFormatException(Resource.FastQ_InvalidQualityScoresLength, id);
    }

    bioReader.GoToNextLine();

    // Use the configured alphabet, or identify one from the sequence line.
    IAlphabet alphabet = Alphabet;
    if (alphabet == null)
    {
        alphabet = IdentifyAlphabet(alphabet, sequenceLine);
        if (alphabet == null)
        {
            throw CreateFastQFormatException(Resource.InvalidSymbolInString, sequenceLine);
        }
    }

    // Configured format type, overridden by detection when auto-detect is on.
    FastQFormatType formatType = FastqType;
    if (AutoDetectFastQFormat)
    {
        formatType = IdentifyFastQFormatType(qualityScores);
    }

    QualitativeSequence result = Encoding == null
        ? new QualitativeSequence(alphabet, formatType, sequenceLine, qualityScores)
        : new QualitativeSequence(alphabet, formatType, Encoding, sequenceLine, qualityScores);
    result.ID = id;
    result.IsReadOnly = isReadOnly;

    // full load
    if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
    {
        return result;
    }

    pointer.AlphabetName = result.Alphabet.Name;
    pointer.Id = result.ID;
    _sequencePointers.Add(pointer);

    FileVirtualQualitativeSequenceProvider provider = new FileVirtualQualitativeSequenceProvider(this, pointer);
    provider.BlockSize = _blockSize;
    provider.MaxNumberOfBlocks = _maxNumberOfBlocks;
    result.VirtualQualitativeSequenceProvider = provider;

    return result;
}

// Builds the two-level FASTQ error message (the detail text wrapped in the
// generic IO format error for this parser), reports it through Trace, and
// returns the exception for the caller to throw.
private FileFormatException CreateFastQFormatException(string detailFormat, string detailArgument)
{
    string detail = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
    string fullMessage = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
    Trace.Report(fullMessage);
    return new FileFormatException(fullMessage);
}
/// <summary>
/// Parses the sequence data of a characters block: reads the declared sequence
/// length from the DIMENSIONS command ("nchar="), skips the FORMAT command, and
/// collects the rows of the MATRIX command for the requested sequence IDs.
/// </summary>
/// <remarks>
/// Rows for the same ID that appear more than once in the matrix are
/// concatenated in order of appearance. Rows whose first token is not in
/// <paramref name="IDs"/> are ignored. Blank lines are skipped, and input that
/// ends before the closing ';' stops parsing instead of reading past the last line.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="IDs">List of sequence IDs</param>
/// <returns>Map from sequence ID to its concatenated sequence data.</returns>
/// <exception cref="FormatException">
/// A matrix row for a known ID has no data, or a collected sequence does not
/// have the length declared by "nchar=".
/// </exception>
private static Dictionary<string, string> ParseCharacterBlock(BioTextReader bioReader, IList<string> IDs)
{
    bool isInCharactersBlock = true;
    int sequenceLength = 0;
    Dictionary<string, string> dataSet = new Dictionary<string, string>();

    while (bioReader.HasLines && isInCharactersBlock)
    {
        bioReader.GoToNextLine();

        // Truncated input: stop instead of reading a line that is not there.
        if (!bioReader.HasLines)
        {
            break;
        }

        // Blank lines carry no command.
        if (string.IsNullOrEmpty(bioReader.Line.Trim()))
        {
            continue;
        }

        IList<string> tokens = GetTokens(bioReader.Line);
        if (tokens.Count == 0)
        {
            continue;
        }

        // The commands are mutually exclusive, so they form a single if/else
        // chain. This keeps a continuation line that consists only of ';'
        // (the end of a multi-line DIMENSIONS or FORMAT command) from being
        // mistaken for the end of the whole block.
        if (0 == string.Compare("DIMENSIONS", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            // Parse dimensions: only the sequence length ("nchar=") is used.
            // The command may span several lines and ends at a trailing ';'.
            while (true)
            {
                foreach (string token in tokens)
                {
                    string setting = token.Trim(new char[] { ';' });
                    if (string.IsNullOrEmpty(setting))
                    {
                        continue;
                    }

                    if (setting.StartsWith("nchar=", StringComparison.OrdinalIgnoreCase))
                    {
                        sequenceLength = Int32.Parse(setting.Substring(6), CultureInfo.InvariantCulture);
                    }
                }

                if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                bioReader.GoToNextLine();
                if (!bioReader.HasLines)
                {
                    break;
                }

                tokens = GetTokens(bioReader.Line);
            }
        }
        else if (0 == string.Compare("FORMAT", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            // The format settings (missing, gap, matchchar, data type) are not
            // used; just skip to the line that closes the command with ';'.
            while (!bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
            {
                bioReader.GoToNextLine();
                if (!bioReader.HasLines)
                {
                    break;
                }
            }
        }
        else if (0 == string.Compare("MATRIX", tokens[0], StringComparison.OrdinalIgnoreCase))
        {
            // "If available" ignore the data in square brackets []
            while (bioReader.HasLines)
            {
                if (bioReader.Line.StartsWith("[", StringComparison.OrdinalIgnoreCase))
                {
                    bioReader.GoToNextLine();
                }
                else
                {
                    break;
                }
            }

            // Here are the alignment sequences
            while (bioReader.HasLines)
            {
                bioReader.GoToNextLine();
                if (!bioReader.HasLines)
                {
                    break;
                }

                if (string.IsNullOrEmpty(bioReader.Line.Trim()))
                {
                    continue;
                }

                tokens = GetTokens(bioReader.Line);
                if (tokens.Count == 0)
                {
                    continue;
                }

                // ';' closes the matrix and with it the characters block
                if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    isInCharactersBlock = false;
                    break;
                }

                if (IDs.Contains(tokens[0]))
                {
                    // A row for a known ID must carry data.
                    if (tokens.Count < 2)
                    {
                        throw new FormatException(Properties.Resource.SequenceLengthMismatch);
                    }

                    // Append to what was collected for this ID so far.
                    string collected;
                    dataSet[tokens[0]] = dataSet.TryGetValue(tokens[0], out collected)
                        ? string.Concat(collected, tokens[1])
                        : tokens[1];
                }
            }
        }
        else if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
        {
            isInCharactersBlock = false;
        }
    }

    // Read the end line "end;"
    if (bioReader.HasLines)
    {
        bioReader.GoToNextLine();
    }

    // Validate the length of sequence
    foreach (string dataSequence in dataSet.Values)
    {
        if (dataSequence.Length != sequenceLength)
        {
            throw new FormatException(Properties.Resource.SequenceLengthMismatch);
        }
    }

    return dataSet;
}
/// <summary>
/// Parses a single ClustalW text from a reader into a sequence alignment.
/// </summary>
/// <remarks>
/// The text must start with a "CLUSTAL" header line. Each data line holds a
/// sequence id, a sequence segment and optionally a number, which is ignored.
/// Blocks are separated by blank or consensus lines. Sequences are created from
/// the first block; segments from later blocks are appended to the sequence
/// with the same id. Segment data is converted to upper case.
/// </remarks>
/// <param name="bioReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
/// If this flag is set to true then the resulting sequence's isReadOnly property
/// will be set to true, otherwise it will be set to false.</param>
/// <returns>A new Sequence Alignment instance containing parsed data.</returns>
/// <exception cref="ArgumentNullException">bioReader is null.</exception>
/// <exception cref="InvalidDataException">
/// The header is missing, a data line has no sequence segment, a segment holds
/// invalid symbols, the sequences in the first block use different alphabets,
/// or a later block refers to an id not seen in the first block.
/// </exception>
protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
{
    if (bioReader == null)
    {
        throw new ArgumentNullException("bioReader");
    }

    string message;
    if (!bioReader.Line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
        throw new InvalidDataException(message);
    }

    bioReader.GoToNextLine();  // Skip blank lines until we get to the first block.

    // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
    bioReader.SkipBlankLines = false;

    Dictionary<string, ISequence> mapIdToSequence = new Dictionary<string, ISequence>();
    IAlphabet alignmentAlphabet = null;
    bool isFirstBlock = true;
    bool inBlock = false;

    while (bioReader.HasLines)
    {
        // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore.
        // (char[])null uses whitespace delimiters.
        string[] tokens = String.IsNullOrEmpty(bioReader.Line)
            ? new string[0]
            : bioReader.Line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

        // Blank line or consensus line signals end of block. A line that holds
        // only whitespace (tabs included) yields no tokens and counts as blank.
        if (tokens.Length == 0 || Helper.ContainsOnly(bioReader.Line, '*', ' ', '.', '+', ':'))
        {
            if (inBlock)
            {
                inBlock = false;
                isFirstBlock = false;
            }
        }
        else
        {
            // It's a data line in a block. It needs at least an id and a segment;
            // reject it here instead of failing with IndexOutOfRangeException.
            if (tokens.Length < 2)
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            string id = tokens[0];
            string data = tokens[1].ToUpper(CultureInfo.InvariantCulture);
            Sequence sequence = null;
            IAlphabet alphabet = Alphabet;
            inBlock = true;

            if (isFirstBlock)
            {
                // Without a configured alphabet, identify it from the data and make
                // sure every sequence in the alignment agrees on it.
                if (null == alphabet)
                {
                    alphabet = _basicParser.IdentifyAlphabet(alphabet, data);
                    if (null == alphabet)
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Resource.InvalidSymbolInString,
                            data);
                        throw new InvalidDataException(message);
                    }

                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        message = string.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.SequenceAlphabetMismatch);
                        throw new InvalidDataException(message);
                    }
                }

                if (Encoding == null)
                {
                    sequence = new Sequence(alphabet, data);
                }
                else
                {
                    sequence = new Sequence(alphabet, Encoding, data);
                }

                sequence.ID = id;

                // Stays writable while later blocks are appended; the requested
                // read-only state is applied at the end.
                sequence.IsReadOnly = false;
                mapIdToSequence.Add(id, sequence);
            }
            else
            {
                // Later blocks may only extend sequences introduced by the first block.
                ISequence existing;
                if (!mapIdToSequence.TryGetValue(id, out existing))
                {
                    message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.ClustalUnknownSequence, id);
                    throw new InvalidDataException(message);
                }

                sequence = (Sequence)existing;
                sequence.InsertRange(sequence.Count, data);
            }
        }

        bioReader.GoToNextLine();
    }

    SequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
    foreach (Sequence alignmentSequence in mapIdToSequence.Values)
    {
        alignmentSequence.IsReadOnly = isReadOnly;
        sequenceAlignment.AlignedSequences[0].Sequences.Add(alignmentSequence);
    }

    return sequenceAlignment;
}