示例#1
0
        /// <summary>
        /// Parses the Nexus header: checks the "#NEXUS" signature on the current line and
        /// skips the optional bracketed title comment that may follow it.
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when the current line does not start with "#NEXUS" (case-insensitive).
        /// </exception>
        private void ParseHeader(MBFTextReader mbfReader)
        {
            if (!mbfReader.Line.StartsWith("#NEXUS", StringComparison.OrdinalIgnoreCase))
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            mbfReader.GoToNextLine();  // Skip blank lines until we get to the first block.

            // Title of Alignment: a comment opened by a line starting with '[' and closed by
            // a line ending with ']'.
            if (mbfReader.Line.Trim().StartsWith("[", StringComparison.OrdinalIgnoreCase))
            {
                // Test the current line BEFORE advancing so that a title written on a single
                // line ("[ ... ]") is recognised as already closed instead of swallowing the
                // lines that follow it. HasLines is checked before every Line access so an
                // unterminated title stops at the end of the input.
                while (mbfReader.HasLines &&
                       !mbfReader.Line.Trim().EndsWith("]", StringComparison.OrdinalIgnoreCase))
                {
                    mbfReader.GoToNextLine();
                }
            }

            // Step past the line the reader is currently on (the closing title line when a
            // title was present).
            mbfReader.GoToNextLine();

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            mbfReader.SkipBlankLines = false;
        }
示例#2
0
        /// <summary>
        /// Parses the ORIGIN section: stores the optional data found on the ORIGIN line,
        /// gathers the sequence text from the indented lines that follow, and appends it
        /// to <paramref name="sequence"/>.
        /// </summary>
        /// <param name="mbfReader">Reader positioned on the ORIGIN line.</param>
        /// <param name="metadata">Metadata object that receives the ORIGIN line data.</param>
        /// <param name="sequence">
        /// Sequence that receives the parsed symbols. It is replaced by a new instance when
        /// the alphabet identified from the data differs from the sequence's alphabet.
        /// </param>
        /// <exception cref="System.IO.InvalidDataException">
        /// Thrown when no alphabet can be identified for the sequence text.
        /// </exception>
        private void ParseOrigin(MBFTextReader mbfReader, GenBankMetadata metadata, ref Sequence sequence)
        {
            // The origin line can contain optional data; don't put empty string into
            // metadata.
            if (!String.IsNullOrEmpty(mbfReader.LineData))
            {
                metadata.Origin = mbfReader.LineData;
            }
            mbfReader.GoToNextLine();

            var sequenceBuilder = new StringBuilder();

            // Sequence lines are the ones that start with a space.
            while (mbfReader.HasLines && mbfReader.Line[0] == ' ')
            {
                // Using a regex is too slow. Characters from index 10 onward are taken in
                // groups of up to 10, skipping one character between groups. Appending the
                // range directly avoids allocating a temporary substring per group.
                string line = mbfReader.Line;
                int    len  = line.Length;
                for (int k = 10; k < len; k += 11)
                {
                    sequenceBuilder.Append(line, k, Math.Min(10, len - k));
                }

                mbfReader.GoToNextLine();
            }

            var sequenceString = sequenceBuilder.ToString().Trim();

            if (string.IsNullOrEmpty(sequenceString))
            {
                return;
            }

            // The alphabet is only identified from the data when the parser has none set.
            if (Alphabet == null)
            {
                IAlphabet alphabet = null;
                alphabet = IdentifyAlphabet(alphabet, sequenceString);

                if (alphabet == null)
                {
                    // NOTE(review): the reader has already moved past the sequence lines
                    // here, so the reported line is the one following the sequence data.
                    var message = String.Format(
                        System.Globalization.CultureInfo.CurrentCulture,
                        Resource.InvalidSymbolInString,
                        mbfReader.Line);
                    Trace.Report(message);

                    // A specific exception type instead of the bare System.Exception;
                    // callers catching Exception are unaffected.
                    throw new System.IO.InvalidDataException(message);
                }

                if (sequence.Alphabet != alphabet)
                {
                    // Rebuild the sequence on the identified alphabet, carrying over the
                    // molecule type, then release the old instance's content.
                    Sequence seq = new Sequence(alphabet, Encoding, sequence)
                    {
                        MoleculeType = sequence.MoleculeType,
                        IsReadOnly   = false
                    };
                    sequence.Clear();
                    sequence = seq;
                }
            }

            sequence.InsertRange(sequence.Count, sequenceString);
        }
示例#3
0
        /// <summary>
        /// Exercises the core members of MBFTextReader against the test file: line access,
        /// line-number tracking, data indent switching, blank header/data recognition,
        /// section skipping and end-of-file detection.
        /// </summary>
        public void TestMBFTextReaderCoreFunctionality()
        {
            using (MBFTextReader mbfReader = new MBFTextReader(testFileFullName))
            {
                // Test line access members.
                Assert.IsTrue(mbfReader.HasLines);
                Assert.AreEqual("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
                                mbfReader.Line);
                Assert.IsTrue(mbfReader.LineHasHeader);
                Assert.AreEqual("LOCUS", mbfReader.LineHeader);
                Assert.IsTrue(mbfReader.LineHasData);
                Assert.AreEqual("SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
                                mbfReader.LineData);
                Assert.AreEqual("NA  ", mbfReader.GetLineField(38, 41));

                // Test reading lines and line number tracking.
                // The loop index mirrors the line the reader is on before each advance:
                // five advances from the first line are expected to end on line 6.
                for (int i = 1; i < 6; i++)
                {
                    mbfReader.GoToNextLine();
                }
                Assert.AreEqual(6, mbfReader.LineNumber);
                Assert.AreEqual("KEYWORDS", mbfReader.LineHeader);

                // Test switching line indent.
                mbfReader.DataIndent = 2;
                Assert.AreEqual("KE", mbfReader.LineHeader);
                Assert.AreEqual("YWORDS    .", mbfReader.LineData);

                // Test recognition of blank header and data.
                for (int i = 6; i < 8; i++)
                {
                    mbfReader.GoToNextLine();
                }
                Assert.IsFalse(mbfReader.LineHasHeader); // line starts with 2 spaces
                Assert.IsTrue(mbfReader.LineHasData);
                mbfReader.DataIndent = 37;               // the line length
                Assert.IsTrue(mbfReader.LineHasHeader);
                Assert.IsFalse(mbfReader.LineHasData);
                mbfReader.DataIndent = 12; // back to standard line length

                // Test skipping sections and EOF recognition.
                mbfReader.SkipToNextSection(); // ref 1
                mbfReader.SkipToNextSection(); // ref 2
                mbfReader.SkipToNextSection(); // features
                mbfReader.SkipToNextSection(); // origin
                mbfReader.SkipToNextSection(); // "//"
                Assert.IsTrue(mbfReader.HasLines);
                mbfReader.GoToNextLine();      // EOF
                // Once the reader has moved past the last line it must report no lines.
                Assert.IsFalse(mbfReader.HasLines);
            }
        }
示例#4
0
        // Reads the tail of a record: an optional BASE COUNT line, the ORIGIN section with
        // the sequence data, and an optional CONTIG entry. Stops after consuming the "//"
        // terminator line, or when the reader runs out of lines. Any other line header
        // is reported and raises InvalidDataException.
        private void ParseSequence(MBFTextReader mbfReader, ref Sequence sequence)
        {
            GenBankMetadata genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            // set data indent for sequence headers
            mbfReader.DataIndent = _dataIndent;

            while (mbfReader.HasLines)
            {
                bool isRecordTerminator = mbfReader.Line.StartsWith("//", StringComparison.Ordinal);
                if (isRecordTerminator)
                {
                    // end of sequence record: consume the terminator and finish
                    mbfReader.GoToNextLine();
                    return;
                }

                string header = mbfReader.LineHeader;

                if (header == "BASE COUNT")
                {
                    // The BASE COUNT linetype is obsolete and was removed
                    // from the GenBank flatfile format in October 2003.  But if it is
                    // present, we will use it.  We get the untrimmed version since it
                    // starts with a right justified column.
                    metadata_SetBaseCount:
                    genBankData.BaseCount = mbfReader.Line.Substring(_dataIndent);
                    mbfReader.GoToNextLine();
                }
                else if (header == "ORIGIN")
                {
                    // The alphabet is validated once after the whole sequence has been
                    // read rather than on every line, which matters for large sequences.
                    ParseOrigin(mbfReader, genBankData, ref sequence);
                }
                else if (header == "CONTIG")
                {
                    // ParseMultiLineData leaves the reader on the first line it did not
                    // consume, so there is no extra advance here.
                    genBankData.Contig = ParseMultiLineData(mbfReader, Environment.NewLine);
                }
                else
                {
                    string unexpectedLineMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserUnexpectedLineInSequence,
                        mbfReader.Line);
                    Trace.Report(unexpectedLineMessage);
                    throw new InvalidDataException(unexpectedLineMessage);
                }
            }
        }
示例#5
0
        // Collects the data of a header block that spans several lines into one string.
        // Starts with the data of the current line, then appends the data of every
        // following line that has no header, each preceded by lineBreakSubstitution.
        // On return the reader is on the first line that was not consumed.
        private static string ParseMultiLineData(MBFTextReader mbfReader, string lineBreakSubstitution)
        {
            string combined = mbfReader.LineData;

            for (mbfReader.GoToNextLine();
                 mbfReader.HasLines && !mbfReader.LineHasHeader;
                 mbfReader.GoToNextLine())
            {
                // continuation line: no header of its own
                combined = combined + lineBreakSubstitution + mbfReader.LineData;
            }

            return combined;
        }
示例#6
0
        /// <summary>
        /// Parses every sequence alignment available from a MBFTextReader.
        /// </summary>
        /// <remarks>
        /// This method should be overridden by any parsers that need to process file-scope
        /// metadata that applies to everything in the file.
        /// </remarks>
        /// <param name="mbfReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Read-only flag handed unchanged to ParseOneWithSpecificFormat for each alignment.
        /// </param>
        /// <returns>The parsed ISequenceAlignment objects, in the order they were read.</returns>
        /// <exception cref="ArgumentNullException">Thrown when mbfReader is null.</exception>
        /// <exception cref="InvalidDataException">Thrown when the reader has no lines.</exception>
        protected virtual IList<ISequenceAlignment> Parse(MBFTextReader mbfReader, bool isReadOnly)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            // no empty files allowed
            if (!mbfReader.HasLines)
            {
                throw new InvalidDataException(Properties.Resource.IONoTextToParse);
            }

            List<ISequenceAlignment> parsed = new List<ISequenceAlignment>();

            while (mbfReader.HasLines)
            {
                bool lineIsBlank = mbfReader.Line.Trim().Length == 0;

                if (lineIsBlank)
                {
                    // blank lines between alignments carry no data
                    mbfReader.GoToNextLine();
                }
                else
                {
                    parsed.Add(ParseOneWithSpecificFormat(mbfReader, isReadOnly));
                }
            }

            return parsed;
        }
示例#7
0
        /// <summary>
        /// Reads the GenBank LOCUS line with the token based parser, which tolerates
        /// documents that do not follow the standard layout exactly, and applies the
        /// result to the sequence.
        /// </summary>
        /// <param name="mbfReader">Reader positioned on the LOCUS line; advanced by one line on return.</param>
        /// <param name="sequence">
        /// Sequence that receives the ID, molecule type and locus metadata. It is replaced
        /// by a new instance when the LOCUS molecule type maps to a different alphabet.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// Thrown when the parser has an alphabet set that differs from the one derived
        /// from the LOCUS line.
        /// </exception>
        private void ParseLocusByTokens(MBFTextReader mbfReader, ref Sequence sequence)
        {
            var       locus         = new GenBankLocusTokenParser().Parse(mbfReader.LineData);
            IAlphabet locusAlphabet = GetAlphabet(locus.MoleculeType);

            if (locusAlphabet != sequence.Alphabet)
            {
                // An alphabet fixed on the parser must agree with the one in the file.
                bool parserAlphabetAgrees = Alphabet == null || Alphabet == locusAlphabet;
                if (!parserAlphabetAgrees)
                {
                    Trace.Report(Resource.ParserIncorrectAlphabet);
                    throw new InvalidDataException(Resource.ParserIncorrectAlphabet);
                }

                Sequence rebuilt = new Sequence(locusAlphabet, Encoding, sequence);
                rebuilt.IsReadOnly = false;
                sequence           = rebuilt;
            }

            sequence.ID           = locus.Name;
            sequence.MoleculeType = locus.MoleculeType;

            var genBankData = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            genBankData.Locus = locus;

            mbfReader.GoToNextLine();
        }
示例#8
0
 /// <summary>
 /// Reads aligned sequences line by line and adds them to the alignment map.
 /// Reading stops when the reader has no more lines or reaches a line starting with "@".
 /// </summary>
 /// <param name="seqAlignment">SequenceAlignmentMap whose QuerySequences collection is filled.</param>
 /// <param name="mbfReader">A reader for the sequence alignment text.</param>
 /// <param name="isReadOnly">
 /// Read-only flag handed unchanged to ParseSequence for every aligned sequence.
 /// </param>
 private void ParseSequences(SequenceAlignmentMap seqAlignment, MBFTextReader mbfReader, bool isReadOnly)
 {
     for (;
          mbfReader.HasLines && !mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase);
          mbfReader.GoToNextLine())
     {
         SAMAlignedSequence parsed = ParseSequence(mbfReader, isReadOnly);
         seqAlignment.QuerySequences.Add(parsed);
     }
 }
示例#9
0
        /// <summary>
        /// Parses SAM alignment header from specified MBFTextReader.
        /// Header lines are the consecutive lines starting with "@" at the reader's
        /// current position; on return the reader is on the first non-header line.
        /// </summary>
        /// <param name="mbfReader">MBF text reader.</param>
        /// <returns>
        /// The parsed header. It has no record fields or comments added when the reader
        /// is not positioned on a header line.
        /// </returns>
        /// <exception cref="ArgumentNullException">Thrown when mbfReader is null.</exception>
        /// <exception cref="FormatException">
        /// Thrown when header lines were read and the header's IsValid() returns a message.
        /// </exception>
        public static SAMAlignmentHeader ParseSAMHeader(MBFTextReader mbfReader)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            _headerLength = 0;
            SAMAlignmentHeader samHeader = new SAMAlignmentHeader();

            // The header is validated only when at least one header line is present.
            if (mbfReader.HasLines && mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
            {
                while (mbfReader.HasLines && mbfReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
                {
                    _headerLength += mbfReader.Line.Length;
                    string[] tokens         = mbfReader.Line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);
                    string   recordTypecode = tokens[0].Substring(1); // drop the leading '@'

                    // Validate the header format.
                    ValidateHeaderLineFormat(mbfReader.Line);

                    if (string.Compare(recordTypecode, "CO", StringComparison.OrdinalIgnoreCase) != 0)
                    {
                        // Each remaining token is split into a two character tag name and
                        // the value that starts after the separator at index 2.
                        // NOTE(review): presumably ValidateHeaderLineFormat guarantees the
                        // tokens are long enough for these Substring calls - confirm.
                        SAMRecordField headerLine = new SAMRecordField(recordTypecode);
                        for (int i = 1; i < tokens.Length; i++)
                        {
                            string tagToken = tokens[i];
                            string tagName  = tagToken.Substring(0, 2);
                            headerLine.Tags.Add(new SAMRecordFieldTag(tagName, tagToken.Substring(3)));
                        }

                        samHeader.RecordFields.Add(headerLine);
                    }
                    else
                    {
                        // Comment line: keep the text after the first four characters.
                        samHeader.Comments.Add(mbfReader.Line.Substring(4));
                    }

                    mbfReader.GoToNextLine();
                }

                string message = samHeader.IsValid();
                if (!string.IsNullOrEmpty(message))
                {
                    throw new FormatException(message);
                }
            }

            return samHeader;
        }
示例#10
0
        /// <summary>
        /// Reads XML BLAST data from the reader and builds one BlastResult per XML
        /// document found in the text. Documents are delimited by lines starting with
        /// an XML declaration; lines starting with "RPS-BLAST" are skipped.
        /// </summary>
        /// <param name="reader">The text source</param>
        /// <returns>A list with one BlastResult for each XML document in the text.</returns>
        /// <exception cref="FormatException">Thrown when no records were produced.</exception>
        public IList<BlastResult> Parse(TextReader reader)
        {
            List<BlastResult> records = new List<BlastResult>();
            StringBuilder     sb      = new StringBuilder();

            using (MBFTextReader mbfReader = new MBFTextReader(reader))
            {
                mbfReader.SkipBlankLines = false;
                while (mbfReader.HasLines)
                {
                    if (mbfReader.Line.StartsWith("RPS-BLAST", StringComparison.OrdinalIgnoreCase))
                    {
                        mbfReader.GoToNextLine();
                        continue;
                    }

                    // A new XML declaration closes the document collected so far. The check
                    // is on the buffer rather than on the line number: when only skipped
                    // "RPS-BLAST" lines precede the first declaration there is nothing
                    // buffered yet, and an empty document must not be handed to ParseXML.
                    if (mbfReader.Line.StartsWith("<?xml version", StringComparison.OrdinalIgnoreCase) &&
                        sb.Length > 0)
                    {
                        records.Add(ParseXML(sb));
                        sb = new StringBuilder();
                    }

                    sb.AppendLine(mbfReader.Line);
                    mbfReader.GoToNextLine();
                }
            }

            // Flush the last (or only) document.
            if (sb.Length > 0)
            {
                records.Add(ParseXML(sb));
            }

            if (records.Count == 0)
            {
                string message = Properties.Resource.BlastNoRecords;
                Trace.Report(message);
                throw new FormatException(message);
            }

            return records;
        }
示例#11
0
        /// <summary>
        /// Parses a SequenceAlignmentMap using a MBFTextReader.
        /// Leading blank lines are skipped; the first non-blank line starts the parse.
        /// </summary>
        /// <param name="mbfReader">A reader for a sequence alignment text.</param>
        /// <param name="isReadOnly">
        /// Read-only flag handed unchanged to ParseOneWithSpecificFormat.
        /// </param>
        /// <returns>
        /// The result of ParseOneWithSpecificFormat, or null when the reader holds nothing
        /// but blank lines.
        /// </returns>
        private SequenceAlignmentMap Parse(MBFTextReader mbfReader, bool isReadOnly)
        {
            _fileName = mbfReader.FileName;

            // Move past any blank lines in front of the content.
            while (mbfReader.HasLines && mbfReader.Line.Trim().Length == 0)
            {
                mbfReader.GoToNextLine();
            }

            if (!mbfReader.HasLines)
            {
                return null;
            }

            return ParseOneWithSpecificFormat(mbfReader, isReadOnly);
        }
示例#12
0
        /// <summary>
        /// Lazily yields the aligned sequences read from the given reader, one per line.
        /// When a filter is applied, only the sequences accepted by Filter are yielded.
        /// </summary>
        /// <param name="textReader">Reader over the SAM text, positioned on the first alignment line.</param>
        /// <returns>The aligned sequences that pass the filter, in reading order.</returns>
        private IEnumerable<SAMAlignedSequence> GetAlignedSequence(MBFTextReader textReader)
        {
            bool filterActive = IsFilterApplied();

            for (; textReader.HasLines; textReader.GoToNextLine())
            {
                SAMAlignedSequence current = SAMParser.ParseSequence(textReader, false);

                // Filter is consulted only when filtering is active.
                if (!filterActive || Filter(current))
                {
                    yield return current;
                }
            }
        }
示例#13
0
        // Parses the consecutive feature lines for one sequence.
        //
        // Lines starting with the header mark are skipped. Every other line is split on
        // tabs and must have between _minFieldsPerFeature and _maxFieldsPerFeature fields.
        // Parsing stops, without consuming the line, at the first feature whose sequence
        // name differs from the sequence being processed.
        //
        // Throws InvalidDataException for a wrong field count or an invalid start, end,
        // score, strand or frame value, and InvalidOperationException when no feature
        // line was found at all.
        private void ParseFeatures(MBFTextReader mbfReader)
        {
            // The non-comment lines contain features, which are each stored as MetadataListItems.
            // The fields of each feature are referred to as sub-items.  For GFF, these have
            // unique keys, but for compatibility with our internal representation of features from
            // GenBank format, each sub-item is a list of strings, rather than a simple string.
            List<MetadataListItem<List<string>>> featureList = null;

            Sequence specificSeq = null;

            while (mbfReader.HasLines)
            {
                if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    // ignore comments
                    mbfReader.GoToNextLine();
                    continue;
                }

                // fields are tab-delimited
                string[] featureFields = mbfReader.Line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);

                if (featureFields.Length < _minFieldsPerFeature ||
                    featureFields.Length > _maxFieldsPerFeature)
                {
                    string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                    throw new InvalidDataException(message);
                }

                // The featureFields array should now contain the following fields:
                //      featureFields[0]: sequence name
                //      featureFields[1]: source
                //      featureFields[2]: feature name
                //      featureFields[3]: start
                //      featureFields[4]: end
                //      featureFields[5]: score
                //      featureFields[6]: strand
                //      featureFields[7]: frame
                //      featureFields[8]: attributes (optional)

                // Process sequence name.
                if (specificSeq == null)
                {
                    specificSeq = GetSpecificSequence(featureFields[0], MoleculeType.Invalid, mbfReader);

                    // Retrieve features list, or add empty features list to metadata if this
                    // is the first feature.
                    // NOTE(review): if an existing "features" entry has another type the
                    // cast yields null and the Add below fails - confirm that cannot happen.
                    if (specificSeq.Metadata.ContainsKey("features"))
                    {
                        featureList = specificSeq.Metadata["features"] as
                                      List<MetadataListItem<List<string>>>;
                    }
                    else
                    {
                        featureList = new List<MetadataListItem<List<string>>>();
                        specificSeq.Metadata["features"] = featureList;
                    }
                }
                else if (specificSeq.DisplayID != featureFields[0])
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }

                // use feature name as key; attributes field is stored as free text
                const int attributesIndex = 8;
                string    attributes      = featureFields.Length > attributesIndex
                    ? featureFields[attributesIndex]
                    : string.Empty;
                MetadataListItem<List<string>> feature =
                    new MetadataListItem<List<string>>(featureFields[2], attributes);

                // source
                feature.SubItems.Add(_sourceKey, new List<string> { featureFields[1] });

                // The numeric fields are file data, not user input, so they are validated
                // with the invariant culture: the outcome must not depend on the culture
                // of the machine reading the file (e.g. a score of "0.95" has to be
                // accepted where the current culture's decimal separator is a comma).
                int    ignoredInt;
                double ignoredDouble;

                // start is an int
                if (!int.TryParse(featureFields[3], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoredInt))
                {
                    throw CreateInvalidFieldException("start", featureFields[3]);
                }
                feature.SubItems.Add("start", new List<string> { featureFields[3] });

                // end is an int
                if (!int.TryParse(featureFields[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoredInt))
                {
                    throw CreateInvalidFieldException("end", featureFields[4]);
                }
                feature.SubItems.Add("end", new List<string> { featureFields[4] });

                // score is a double, or a dot as a space holder
                if (featureFields[5] != ".")
                {
                    if (!double.TryParse(
                            featureFields[5],
                            NumberStyles.Float | NumberStyles.AllowThousands,
                            CultureInfo.InvariantCulture,
                            out ignoredDouble))
                    {
                        throw CreateInvalidFieldException("score", featureFields[5]);
                    }
                    feature.SubItems.Add("score", new List<string> { featureFields[5] });
                }

                // strand is + or -, or a dot as a space holder
                if (featureFields[6] != ".")
                {
                    if (featureFields[6] != "+" && featureFields[6] != "-")
                    {
                        throw CreateInvalidFieldException("strand", featureFields[6]);
                    }
                    feature.SubItems.Add("strand", new List<string> { featureFields[6] });
                }

                // frame is an int, or a dot as a space holder
                if (featureFields[7] != ".")
                {
                    if (!int.TryParse(featureFields[7], NumberStyles.Integer, CultureInfo.InvariantCulture, out ignoredInt))
                    {
                        throw CreateInvalidFieldException("frame", featureFields[7]);
                    }
                    feature.SubItems.Add("frame", new List<string> { featureFields[7] });
                }

                // done with that one
                featureList.Add(feature);
                mbfReader.GoToNextLine();
            }

            // A feature file with no features?  May it never be.
            if (featureList == null)
            {
                string message = Properties.Resource.GFFNoFeatures;
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }

            // if any seqs are left in _sequencesInHeader add it to _sequences
            if (_sequencesInHeader.Count > 0)
            {
                _sequences.AddRange(_sequencesInHeader);
                _sequencesInHeader.Clear();
            }
        }

        // Builds the "invalid field" message for the named feature field, reports it to
        // the trace, and returns the exception for the caller to throw.
        private static InvalidDataException CreateInvalidFieldException(string fieldName, string fieldValue)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.GffInvalidField,
                fieldName,
                fieldValue);
            Trace.Report(message);
            return new InvalidDataException(message);
        }
示例#14
0
        // Processes headers, which are a type of comment.
        private void ParseHeaders(MBFTextReader mbfReader)
        {
            string comments      = string.Empty;
            int    commentsCount = 1;

            while (mbfReader.HasLines && mbfReader.Line.TrimStart().StartsWith(_commentMark, StringComparison.Ordinal))
            {
                Sequence specificSeq = null;

                // process headers, but ignore other comments
                if (mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    string[] fields = mbfReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // Add if any comments.
                    if (!string.IsNullOrEmpty(comments))
                    {
                        _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                        comments = string.Empty;
                        commentsCount++;
                    }

                    switch (fields[0].ToUpperInvariant())
                    {
                    case _gffVersionKey:
                        if (fields.Length > 1 && fields[1] != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }

                        // Store "GFF-VERSION" to get keep the order of comments/headers.
                        _commonSeq.Metadata[_gffVersionKey] = fields[1];

                        break;

                    case _sourceVersionKey:

                        MetadataListItem <string> sourceVersion = new MetadataListItem <string>(_sourceVersionKey, string.Empty);
                        sourceVersion.SubItems.Add(_sourceKey, fields[1]);
                        sourceVersion.SubItems.Add(_versionKey, fields[2]);

                        _commonSeq.Metadata[_sourceVersionKey] = sourceVersion;

                        break;

                    case _dateKey:
                        DateTime date;
                        if (!DateTime.TryParse(fields[1], out date))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                mbfReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        _commonSeq.Metadata[_dateLowerCaseKey] = date;
                        break;

                    case _typeKey:
                        if (fields.Length == 2)
                        {
                            _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                            if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            _commonSeq.Metadata[_typeKey] = fields[1];
                        }
                        else
                        {
                            specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), mbfReader, false);

                            if (specificSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            // Store "TYPE" to get keep the order of comments/headers.
                            // Store seq id as value.
                            _commonSeq.Metadata[_multiTypeKey + fields[2]] = fields[2];
                        }
                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), mbfReader, false);
                        mbfReader.GoToNextLine();

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqDataKey + fields[1]] = fields[1];

                        while (mbfReader.HasLines && mbfReader.Line != _seqDataEnd + fields[0])
                        {
                            if (!mbfReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    mbfReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            specificSeq.InsertRange(specificSeq.Count, mbfReader.GetLineField(3));

                            mbfReader.GoToNextLine();
                        }

                        break;

                    case _seqRegKey:

                        specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, mbfReader, false);
                        specificSeq.Metadata["start"] = fields[2];
                        specificSeq.Metadata["end"]   = fields[3];

                        // Store seq id as value.
                        _commonSeq.Metadata[_multiSeqRegKey + fields[1]] = fields[1];
                        break;
                    }
                }
                else
                {
                    comments = string.IsNullOrEmpty(comments) ? mbfReader.Line : comments + Environment.NewLine + mbfReader.Line;
                }

                mbfReader.GoToNextLine();
            }

            if (!string.IsNullOrEmpty(comments))
            {
                _commonSeq.Metadata[_commentSectionKey + commentsCount.ToString(CultureInfo.InvariantCulture)] = comments;
                comments = string.Empty;
            }
        }
示例#15
0
        /// <summary>
        /// Parses the feature table. Each feature consists of a key, a location (which may span
        /// several lines) and zero or more qualifiers written as /key=value or just /key.
        /// Empty lines and lines whose header is "FEATURES" are skipped; parsing stops at the
        /// first other line that does not begin with a space, leaving the reader on that line.
        /// </summary>
        /// <param name="mbfReader">Reader over the GenBank text being parsed.</param>
        /// <param name="sequence">
        /// Sequence whose GenBank metadata receives the parsed features (only when at least one
        /// feature was found).
        /// </param>
        /// <exception cref="InvalidOperationException">LocationBuilder is null.</exception>
        /// <exception cref="InvalidDataException">
        /// A feature line has no header, or a qualifier has an empty key before its '='.
        /// </exception>
        private void ParseFeatures(MBFTextReader mbfReader, ref Sequence sequence)
        {
            ILocationBuilder locBuilder = LocationBuilder;

            if (locBuilder == null)
            {
                throw new InvalidOperationException(Resource.NullLocationBuild);
            }

            // set data indent for features
            mbfReader.DataIndent = _featureDataIndent;

            // The sub-items of a feature are referred to as qualifiers.  These do not have unique
            // keys, so they are stored as lists in the SubItems dictionary.
            SequenceFeatures    features    = new SequenceFeatures();
            IList <FeatureItem> featureList = features.All;

            while (mbfReader.HasLines)
            {
                if (String.IsNullOrEmpty(mbfReader.Line) || mbfReader.LineHeader == "FEATURES")
                {
                    mbfReader.GoToNextLine();
                    continue;
                }

                if (mbfReader.Line[0] != ' ')
                {
                    // start of non-feature text
                    break;
                }

                if (!mbfReader.LineHasHeader)
                {
                    string message = Properties.Resource.GenbankEmptyFeature;
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                // check for multi-line location string
                string featureKey = mbfReader.LineHeader;
                string location   = mbfReader.LineData;
                mbfReader.GoToNextLine();
                while (mbfReader.HasLines && !mbfReader.LineHasHeader &&
                       mbfReader.LineHasData && !mbfReader.LineData.StartsWith("/", StringComparison.Ordinal))
                {
                    location += mbfReader.LineData;
                    mbfReader.GoToNextLine();
                }

                // create features as MetadataListItems
                FeatureItem feature = new FeatureItem(featureKey, locBuilder.GetLocation(location));

                // process the list of qualifiers, which are each in the form of
                // /key="value"
                string qualifierKey   = string.Empty;
                string qualifierValue = string.Empty;
                while (mbfReader.HasLines)
                {
                    if (!mbfReader.LineHasHeader && mbfReader.LineHasData)
                    {
                        // a leading '/' starts a new qualifier; anything else continues the previous one
                        if (mbfReader.LineData.StartsWith("/", StringComparison.Ordinal))
                        {
                            // new qualifier; save previous if this isn't the first
                            if (!String.IsNullOrEmpty(qualifierKey))
                            {
                                AddQualifierToFeature(feature, qualifierKey, qualifierValue);
                            }

                            // set the key and value of this qualifier
                            int equalsIndex = mbfReader.LineData.IndexOf('=');
                            if (equalsIndex < 0)
                            {
                                // no value, just key (this is allowed, see NC_005213.gbk)
                                qualifierKey   = mbfReader.LineData.Substring(1);
                                qualifierValue = string.Empty;
                            }
                            else if (equalsIndex > 1)
                            {
                                // Index 0 is always '/', so the key occupies [1, equalsIndex).
                                qualifierKey   = mbfReader.LineData.Substring(1, equalsIndex - 1);
                                qualifierValue = mbfReader.LineData.Substring(equalsIndex + 1);
                            }
                            else
                            {
                                // equalsIndex == 1: the line is "/=...", i.e. a qualifier with an
                                // empty key.  Report it instead of silently dropping the value.
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GenbankInvalidFeature,
                                    mbfReader.Line);
                                Trace.Report(message);
                                throw new InvalidDataException(message);
                            }
                        }
                        else
                        {
                            // Continuation of previous line; "note" gets a line break, and
                            // everything else except "translation" and "transl_except" gets a
                            // space to separate words.
                            if (qualifierKey == "note")
                            {
                                qualifierValue += Environment.NewLine;
                            }
                            else if (qualifierKey != "translation" && qualifierKey != "transl_except")
                            {
                                qualifierValue += " ";
                            }

                            qualifierValue += mbfReader.LineData;
                        }

                        mbfReader.GoToNextLine();
                    }
                    else if (mbfReader.Line.StartsWith("\t", StringComparison.Ordinal))
                    {
                        // this seems to be data corruption; but BioPerl test set includes
                        // (old, 2003) NT_021877.gbk which has this problem, so we
                        // handle it
                        ApplicationLog.WriteLine("WARN: nonstandard line format at line {0}: '{1}'",
                                                 mbfReader.LineNumber, mbfReader.Line);
                        qualifierValue += " " + mbfReader.Line.Trim();
                        mbfReader.GoToNextLine();
                    }
                    else
                    {
                        // next feature, or end of the feature table
                        break;
                    }
                }

                // add last qualifier
                if (!String.IsNullOrEmpty(qualifierKey))
                {
                    AddQualifierToFeature(feature, qualifierKey, qualifierValue);
                }

                // still add feature, even if it has no qualifiers
                featureList.Add(StandardFeatureMap.GetStandardFeatureItem(feature));
            }

            if (featureList.Count > 0)
            {
                ((GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey]).Features = features;
            }
        }
示例#16
0
        /// <summary>
        /// Reads one FASTA record through the supplied MBFTextReader and returns it as a sequence.
        /// Intended for the non-data-virtualization code path.
        /// </summary>
        /// <param name="mbfReader">The MBFTextReader over the text to parse; its current line must start with ">".</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned sequence once all
        /// sequence lines have been appended.
        /// </param>
        /// <returns>The sequence built from the record.</returns>
        protected ISequence ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            // A record has to open with a '>' header line.
            bool startsWithMarker = mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase);
            if (!startsWithMarker)
            {
                string headerError = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.INVALID_INPUT_FILE,
                    Resource.FASTA_NAME);
                Trace.Report(headerError);
                throw new FileFormatException(headerError);
            }

            // Take the identifier from the header line, then move on to the sequence data.
            string sequenceId = mbfReader.GetLineField(2).Trim();
            mbfReader.GoToNextLine();

            // When no alphabet was configured on the parser, identify one from the first data line.
            IAlphabet alphabet = Alphabet;
            if (alphabet == null)
            {
                alphabet = _commonSequenceParser.IdentifyAlphabet(alphabet, mbfReader.Line);
                if (alphabet == null)
                {
                    string symbolError = string.Format(
                        CultureInfo.InvariantCulture,
                        Resource.InvalidSymbolInString,
                        mbfReader.Line);
                    Trace.Report(symbolError);
                    throw new FileFormatException(symbolError);
                }
            }

            Sequence sequence;
            if (Encoding != null)
            {
                sequence            = new Sequence(alphabet, Encoding, string.Empty);
                sequence.IsReadOnly = false;
            }
            else
            {
                sequence = new Sequence(alphabet);
            }

            sequence.ID = sequenceId;

            // Append every line up to the next record header (or the end of the input).
            while (mbfReader.HasLines)
            {
                if (mbfReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }

                if (Alphabet == null)
                {
                    // Re-identify the alphabet for each line, starting from the current one.
                    alphabet = _commonSequenceParser.IdentifyAlphabet(sequence.Alphabet, mbfReader.Line);
                    if (alphabet == null)
                    {
                        string symbolError = string.Format(
                            CultureInfo.InvariantCulture,
                            Resource.InvalidSymbolInString,
                            mbfReader.Line);
                        Trace.Report(symbolError);
                        throw new FileFormatException(symbolError);
                    }

                    if (sequence.Alphabet != alphabet)
                    {
                        // The alphabet changed: rebuild the sequence from the data read so far.
                        Sequence rebuilt = new Sequence(alphabet, Encoding, sequence);
                        rebuilt.IsReadOnly = false;
                        sequence.Clear();
                        sequence = rebuilt;
                    }
                }

                sequence.InsertRange(sequence.Count, mbfReader.Line);
                mbfReader.GoToNextLine();
            }

            if (sequence.MoleculeType == MoleculeType.Invalid)
            {
                sequence.MoleculeType = CommonSequenceParser.GetMoleculeType(sequence.Alphabet);
            }

            sequence.IsReadOnly = isReadOnly;
            return sequence;
        }
示例#17
0
        /// <summary>
        /// Parses consecutive REFERENCE entries and appends them to the References list of the
        /// sequence's GenBank metadata.  Parsing stops at the first line that is neither a
        /// REFERENCE header nor a line beginning with a space; the reader is left on that line.
        /// </summary>
        /// <param name="mbfReader">Reader over the GenBank text being parsed.</param>
        /// <param name="sequence">Sequence whose GenBank metadata receives the references.</param>
        /// <exception cref="InvalidDataException">
        /// A REFERENCE line does not start with a number, or a sub-field header is not recognised.
        /// </exception>
        /// <exception cref="InvalidOperationException">The reference number cannot be parsed as an int.</exception>
        private static void ParseReferences(MBFTextReader mbfReader, ref Sequence sequence)
        {
            GenBankMetadata           metadata      = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            IList <CitationReference> referenceList = metadata.References;
            CitationReference         reference     = null;

            while (mbfReader.HasLines)
            {
                if (mbfReader.LineHeader == "REFERENCE")
                {
                    // add previous reference
                    if (reference != null)
                    {
                        referenceList.Add(reference);
                    }

                    // check for start/end e.g. (bases 1 to 118), or prose notes
                    Match m = Regex.Match(mbfReader.LineData,
                                          @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
                    if (!m.Success)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserReferenceError,
                            mbfReader.LineData);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    // create new reference
                    string number   = m.Groups["number"].Value;
                    string location = m.Groups["location"].Value;
                    reference = new CitationReference();
                    int outValue;
                    if (!int.TryParse(number, out outValue))
                    {
                        // the regex guarantees digits, so this only happens on overflow
                        throw new InvalidOperationException();
                    }
                    reference.Number   = outValue;
                    reference.Location = location;
                    mbfReader.GoToNextLine();
                }
                else if (mbfReader.Line.StartsWith(" ", StringComparison.Ordinal))
                {
                    switch (mbfReader.LineHeader)
                    {
                    // all the following are extracted the same way - possibly multiline
                    case "AUTHORS":
                        reference.Authors = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "CONSRTM":
                        reference.Consortiums = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "TITLE":
                        reference.Title = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "JOURNAL":
                        reference.Journal = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "REMARK":
                        reference.Remarks = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "MEDLINE":
                        reference.Medline = ParseMultiLineData(mbfReader, " ");
                        break;

                    case "PUBMED":
                        reference.PubMed = ParseMultiLineData(mbfReader, " ");
                        break;

                    default:
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidReferenceField,
                            mbfReader.LineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                }
                else
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }
            }

            // Add the last reference.  Doing this after the loop (rather than only when a
            // non-reference line is seen) keeps the final reference when the reader runs out
            // of lines while still inside the REFERENCE section.
            if (reference != null)
            {
                referenceList.Add(reference);
            }
        }
示例#18
0
        /// <summary>
        /// Parses the SOURCE header and its ORGANISM sub-header, then stores the result as a new
        /// SequenceSource on the sequence's GenBank metadata.  The reader is left on the first
        /// line that belongs to neither.
        /// </summary>
        /// <param name="mbfReader">Reader over the GenBank text being parsed.</param>
        /// <param name="sequence">Sequence whose GenBank metadata receives the source information.</param>
        /// <exception cref="InvalidDataException">
        /// A line beginning with a space carries a header other than ORGANISM.
        /// </exception>
        private static void ParseSource(MBFTextReader mbfReader, ref Sequence sequence)
        {
            string commonName   = string.Empty;
            string organismName = string.Empty;
            string taxonomy     = string.Empty;

            while (mbfReader.HasLines)
            {
                if (mbfReader.LineHeader == "SOURCE")
                {
                    // Data can span several lines.  The spec says the last line must end with a
                    // period, which is only checked when there actually are continuation lines.
                    bool endsWithPeriod = true;
                    commonName = mbfReader.LineData;

                    for (mbfReader.GoToNextLine();
                         mbfReader.HasLines && !mbfReader.LineHasHeader;
                         mbfReader.GoToNextLine())
                    {
                        commonName     = commonName + " " + mbfReader.LineData;
                        endsWithPeriod = commonName.EndsWith(".", StringComparison.Ordinal);
                    }

                    if (!endsWithPeriod && Trace.Want(Trace.SeqWarnings))
                    {
                        Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, commonName);
                    }

                    // The reader already sits on the next unprocessed line.
                    continue;
                }

                if (mbfReader.Line[0] != ' ')
                {
                    // Not part of the source section; leave the line for the caller.
                    break;
                }

                if (mbfReader.LineHeader != "ORGANISM")
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidSourceField,
                        mbfReader.LineHeader);
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                // The organism entry can be multi-line too.  Continuation lines ending in ';'
                // or '.' are collected as class levels; any other line extends the organism name.
                organismName = mbfReader.LineData;

                for (mbfReader.GoToNextLine();
                     mbfReader.HasLines && !mbfReader.LineHasHeader;
                     mbfReader.GoToNextLine())
                {
                    string rawLine        = mbfReader.Line;
                    bool   isTaxonomyLine = rawLine.EndsWith(";", StringComparison.Ordinal) ||
                                            rawLine.EndsWith(".", StringComparison.Ordinal);

                    if (!isTaxonomyLine)
                    {
                        organismName += " " + mbfReader.LineData;
                        continue;
                    }

                    if (taxonomy.Length > 0)
                    {
                        taxonomy += " ";
                    }

                    taxonomy += mbfReader.LineData;
                }
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Source            = new SequenceSource();
            metadata.Source.CommonName = commonName;

            if (!string.IsNullOrEmpty(organismName))
            {
                // Text before the first space is the genus, the rest is the species.
                int firstSpace = organismName.IndexOf(" ", StringComparison.Ordinal);
                if (firstSpace > 0)
                {
                    metadata.Source.Organism.Genus   = organismName.Substring(0, firstSpace);
                    metadata.Source.Organism.Species = organismName.Substring(firstSpace + 1);
                }
                else
                {
                    metadata.Source.Organism.Genus = organismName;
                }
            }

            metadata.Source.Organism.ClassLevels = taxonomy;
        }
示例#19
0
        /// <summary>
        /// Parses a single Phylip text from a reader into a sequence alignment.
        /// 1. First line has Count of Taxa and length of each sequence
        /// 2. Sequences
        ///     a. First ten character are ID
        ///     b. Sequence itself
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
        /// If this flag is set to true then the resulting sequence's isReadOnly property
        /// will be set to true, otherwise it will be set to false.</param>
        /// <returns>A new Sequence Alignment instance containing parsed data.</returns>
        /// <exception cref="ArgumentNullException">mbfReader is null.</exception>
        /// <exception cref="InvalidDataException">
        /// The first line does not hold exactly two tokens, a first-block line is too short to
        /// hold an ID and data, a symbol cannot be mapped to an alphabet, the sequences use
        /// different alphabets, or the number/length of sequences does not match the first line.
        /// </exception>
        protected ISequenceAlignment ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            string message = string.Empty;

            // Parse first line
            IList <string> tokens = mbfReader.Line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            if (2 != tokens.Count)
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            bool             isFirstBlock      = true;
            IList <Sequence> data              = new List <Sequence>();
            IAlphabet        alignmentAlphabet = null;

            int sequenceCount  = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
            int sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

            mbfReader.GoToNextLine();  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            mbfReader.SkipBlankLines = false;

            while (mbfReader.HasLines)
            {
                if (string.IsNullOrEmpty(mbfReader.Line.Trim()))
                {
                    mbfReader.GoToNextLine();
                    continue;
                }

                // Each block holds one line per sequence.  Stop early when the input runs out in
                // the middle of a block; the count/length validation below then reports it.
                for (int index = 0; index < sequenceCount && mbfReader.HasLines; index++)
                {
                    if (isFirstBlock)
                    {
                        // First 10 characters are sequence ID, remaining is the first block of sequence
                        // Note that both may contain whitespace, and there may be no whitespace between them.
                        if (mbfReader.Line.Length <= 10)
                        {
                            // Same exception type as every other format error in this method.
                            message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                            throw new InvalidDataException(message);
                        }

                        string id             = mbfReader.Line.Substring(0, 10).Trim();
                        string sequenceString = Util.Helper.StringRemoveWhitespace(mbfReader.Line.Substring(10));

                        IAlphabet alphabet = Alphabet;
                        if (null == alphabet)
                        {
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, sequenceString);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Resource.InvalidSymbolInString,
                                    sequenceString);
                                throw new InvalidDataException(message);
                            }

                            // All auto-detected sequences of an alignment must share one alphabet.
                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                message = Properties.Resource.SequenceAlphabetMismatch;
                                throw new InvalidDataException(message);
                            }
                        }

                        Sequence sequence;
                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, sequenceString);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, sequenceString);
                        }

                        sequence.ID         = id;
                        sequence.IsReadOnly = false;
                        data.Add(sequence);
                    }
                    else
                    {
                        // Later blocks carry no ID; the whole line extends the index-th sequence.
                        Sequence sequence = data[index];
                        sequence.InsertRange(sequence.Count, Util.Helper.StringRemoveWhitespace(mbfReader.Line));
                    }

                    mbfReader.GoToNextLine();
                }

                // Reset the first block flag
                isFirstBlock = false;
            }

            // Validate for the count of sequence
            if (sequenceCount != data.Count)
            {
                throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
            }

            SequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

            foreach (Sequence dataSequence in data)
            {
                dataSequence.IsReadOnly = isReadOnly;

                // Validate the length of each sequence
                if (sequenceLength != dataSequence.Count)
                {
                    throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
                }

                sequenceAlignment.AlignedSequences[0].Sequences.Add(dataSequence);
            }

            return sequenceAlignment;
        }
示例#20
0
        // parses everything before the features section
        private void ParseHeaders(MBFTextReader mbfReader, ref Sequence sequence)
        {
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            string          data     = string.Empty;

            string[] tokens = null;
            // set data indent for headers
            mbfReader.DataIndent = _dataIndent;

            // only allow one locus line
            bool haveParsedLocus = false;

            // parse until we hit the features or sequence section
            bool haveFinishedHeaders = false;

            while (mbfReader.HasLines && !haveFinishedHeaders)
            {
                switch (mbfReader.LineHeader)
                {
                case "LOCUS":
                    if (haveParsedLocus)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserSecondLocus,
                            mbfReader.LocationString);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    ParseLocusByTokens(mbfReader, ref sequence);
                    metadata        = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    haveParsedLocus = true;
                    // don't go to next line; current line still needs to be processed
                    break;

                case "VERSION":
                    tokens = mbfReader.LineData.Split(new char[] { ' ' },
                                                      StringSplitOptions.RemoveEmptyEntries);
                    // first token contains accession and version
                    Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");
                    metadata.Version = new GenBankVersion();

                    if (m.Success)
                    {
                        metadata.Version.Version = m.Groups["version"].Value;
                        // The first token in the data from the accession line is referred to as
                        // the primary accession number, and should be the one used here in the
                        // version line.
                        string versionLineAccession = m.Groups["accession"].Value;
                        if (metadata.Accession == null)
                        {
                            ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                        }
                        else
                        {
                            if (!versionLineAccession.Equals(metadata.Accession.Primary))
                            {
                                ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                            }
                            else
                            {
                                metadata.Version.Accession = metadata.Accession.Primary;
                            }
                        }
                    }
                    // second token contains primary ID
                    m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                    if (m.Success)
                    {
                        metadata.Version.GINumber = m.Groups["primaryID"].Value;
                    }
                    mbfReader.GoToNextLine();
                    break;

                case "PROJECT":
                    tokens = mbfReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.Project      = new ProjectIdentifier();
                        metadata.Project.Name = tokens[0];
                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.Project.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + mbfReader.Line);
                    }
                    mbfReader.GoToNextLine();
                    break;

                case "SOURCE":
                    ParseSource(mbfReader, ref sequence);
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "REFERENCE":
                    ParseReferences(mbfReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "COMMENT":
                    ParseComments(mbfReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "PRIMARY":
                    // This header is followed by sequence info in a table format that could be
                    // stored in a custom object.  The first line contains column headers.
                    // For now, just validate the presence of the headers, and save the data
                    // as a string.
                    tokens = mbfReader.LineData.Split("\t ".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);

                    // Validating for minimum two headers.
                    if (tokens.Length != 4)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserPrimaryLineError,
                            mbfReader.Line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    string primaryData = ParseMultiLineData(mbfReader, Environment.NewLine);
                    metadata.Primary = primaryData;
                    // don't go to next line; current line still needs to be processed
                    break;

                // all the following are extracted the same way - possibly multiline
                case "DEFINITION":
                    metadata.Definition = ParseMultiLineData(mbfReader, " ");
                    break;

                case "ACCESSION":
                    data = ParseMultiLineData(mbfReader, " ");
                    metadata.Accession = new GenBankAccession();
                    string[] accessions = data.Split(' ');
                    metadata.Accession.Primary = accessions[0];

                    for (int i = 1; i < accessions.Length; i++)
                    {
                        metadata.Accession.Secondary.Add(accessions[i]);
                    }
                    break;

                case "DBLINK":
                    tokens = mbfReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.DBLink = new CrossReferenceLink();
                        if (string.Compare(tokens[0],
                                           CrossReferenceType.Project.ToString(),
                                           StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            metadata.DBLink.Type = CrossReferenceType.Project;
                        }
                        else
                        {
                            metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.DBLink.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + mbfReader.Line);
                    }
                    mbfReader.GoToNextLine();
                    break;

                case "DBSOURCE":
                    metadata.DBSource = ParseMultiLineData(mbfReader, " ");
                    break;

                case "KEYWORDS":
                    metadata.Keywords = ParseMultiLineData(mbfReader, " ");
                    break;

                case "SEGMENT":
                    data = ParseMultiLineData(mbfReader, " ");
                    string delimeter = "of";
                    tokens = data.Split(delimeter.ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
                    int outvalue;
                    if (tokens.Length == 2)
                    {
                        metadata.Segment = new SequenceSegment();
                        if (int.TryParse(tokens[0].Trim(), out outvalue))
                        {
                            metadata.Segment.Current = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + mbfReader.Line);
                        }

                        if (int.TryParse(tokens[1].Trim(), out outvalue))
                        {
                            metadata.Segment.Count = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + mbfReader.Line);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + mbfReader.Line);
                    }
                    break;

                // all the following indicate sections beyond the headers parsed by this method
                case "FEATURES":
                case "BASE COUNT":
                case "ORIGIN":
                case "CONTIG":
                    haveFinishedHeaders = true;
                    break;

                default:
                    ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", mbfReader.LineHeader, mbfReader.LineData);
                    string errMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParseHeaderError,
                        mbfReader.LineHeader);
                    Trace.Report(errMessage);
                    throw new InvalidDataException(errMessage);
                }
            }

            // check for required features
            if (!haveParsedLocus)
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
        }
示例#21
0
        /// <summary>
        /// Parses the body of a Nexus CHARACTERS block. The declared sequence length is
        /// taken from the "nchar=" entry of the DIMENSIONS statement, the FORMAT statement
        /// is skipped, and the rows of the MATRIX statement are collected per sequence ID.
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text. The current line
        /// is not examined; parsing starts with the line that follows it.</param>
        /// <param name="IDs">List of sequence IDs. Matrix rows whose first token is not in
        /// this list are ignored.</param>
        /// <returns>Sequence data keyed by sequence ID; rows that repeat an ID are
        /// concatenated in the order they are read.</returns>
        /// <exception cref="FormatException">The "nchar=" value is not a valid integer, or a
        /// collected sequence does not have the declared length.</exception>
        private static Dictionary<string, string> ParseCharacterBlock(MBFTextReader mbfReader, IList<string> IDs)
        {
            bool   isInCharactersBlock = true;
            string data           = string.Empty;
            int    sequenceLength = 0;
            Dictionary<string, string> dataSet = new Dictionary<string, string>();

            while (mbfReader.HasLines && isInCharactersBlock)
            {
                mbfReader.GoToNextLine();

                // Blank lines carry no statement. They are skipped here (as the matrix loop
                // below already does) so that tokens[0] is never read from an empty token list.
                string currentLine = mbfReader.Line;
                if (string.IsNullOrEmpty(currentLine) || 0 == currentLine.Trim().Length)
                {
                    continue;
                }

                IList<string> tokens = GetTokens(currentLine);
                if (0 == tokens.Count)
                {
                    continue;
                }

                if (string.Equals("DIMENSIONS", tokens[0], StringComparison.OrdinalIgnoreCase))
                {
                    tokens[0] = string.Empty;

                    // Parse dimensions
                    // 1. Length of sequence
                    // The statement may span several lines; it ends with the line that ends in ';'.
                    do
                    {
                        foreach (string token in tokens)
                        {
                            data = token.Trim(new char[] { ';' });

                            if (string.IsNullOrEmpty(data))
                            {
                                continue;
                            }

                            if (data.StartsWith("nchar=", StringComparison.OrdinalIgnoreCase))
                            {
                                sequenceLength = Int32.Parse(data.Substring(6), CultureInfo.InvariantCulture);
                            }
                        }

                        if (mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                        else
                        {
                            mbfReader.GoToNextLine();
                            tokens = GetTokens(mbfReader.Line);
                        }
                    }while (mbfReader.HasLines);
                }
                else if (string.Equals("FORMAT", tokens[0], StringComparison.OrdinalIgnoreCase))
                {
                    // The format settings (missing, gap, matchchar, data type) are not used;
                    // just advance to the line that terminates the statement.
                    do
                    {
                        if (mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                        else
                        {
                            mbfReader.GoToNextLine();
                        }
                    }while (mbfReader.HasLines);
                }
                else if (string.Equals("MATRIX", tokens[0], StringComparison.OrdinalIgnoreCase))
                {
                    // This branch is part of the same else-if chain as DIMENSIONS/FORMAT on
                    // purpose: the last line of a multi-line DIMENSIONS/FORMAT statement
                    // (for example a lone ";") must not be re-interpreted as a new statement.

                    // "If available" ignore the data in square brackets []
                    while (mbfReader.HasLines)
                    {
                        if (mbfReader.Line.StartsWith("[", StringComparison.OrdinalIgnoreCase))
                        {
                            mbfReader.GoToNextLine();
                        }
                        else
                        {
                            break;
                        }
                    }

                    // Here are the alignment sequences
                    while (mbfReader.HasLines)
                    {
                        mbfReader.GoToNextLine();

                        if (string.IsNullOrEmpty(mbfReader.Line.Trim()))
                        {
                            continue;
                        }

                        tokens = GetTokens(mbfReader.Line);
                        if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            // A line starting with ';' terminates the matrix and the block.
                            isInCharactersBlock = false;
                            break;
                        }

                        if (IDs.Contains(tokens[0]))
                        {
                            data = tokens[1];

                            // Interleaved matrices repeat the ID; append to what was read so far.
                            string existingData;
                            if (dataSet.TryGetValue(tokens[0], out existingData))
                            {
                                data = string.Concat(existingData, data);
                            }

                            dataSet[tokens[0]] = data;
                        }
                    }
                }
                else if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    isInCharactersBlock = false;
                }
            }

            // Read the end line "end;"
            mbfReader.GoToNextLine();

            // Validate the length of sequence
            foreach (string dataSequence in dataSet.Values)
            {
                if (dataSequence.Length != sequenceLength)
                {
                    throw new FormatException(Properties.Resource.SequenceLengthMismatch);
                }
            }

            return(dataSet);
        }
示例#22
0
        /// <summary>
        /// Parses a single ClustalW text from a reader into a sequence alignment.
        /// The first line must start with "CLUSTAL". The sequences are then read block by
        /// block: the first block creates one sequence per ID, every later block appends
        /// its segment to the sequence that already carries that ID.
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
        /// If this flag is set to true then the resulting sequence's isReadOnly property
        /// will be set to true, otherwise it will be set to false.</param>
        /// <returns>A new Sequence Alignment instance containing parsed data.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
        /// <exception cref="InvalidDataException">The text does not start with "CLUSTAL", a data
        /// line lacks the sequence segment, the alphabet cannot be identified or differs between
        /// sequences, or a later block names a sequence ID that the first block did not.</exception>
        protected ISequenceAlignment ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            string message = string.Empty;

            if (!mbfReader.Line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            mbfReader.GoToNextLine();  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            mbfReader.SkipBlankLines = false;

            Dictionary<string, ISequence> mapIdToSequence = new Dictionary<string, ISequence>();
            IAlphabet alignmentAlphabet = null;
            bool      isFirstBlock      = true;
            bool      inBlock           = false;

            while (mbfReader.HasLines)
            {
                // Blank line or consensus line signals end of block.
                if (String.IsNullOrEmpty(mbfReader.Line) ||
                    Helper.ContainsOnly(mbfReader.Line, '*', ' ', '.', '+', ':'))
                {
                    if (inBlock)
                    {
                        // Blank line signifies end of block
                        inBlock      = false;
                        isFirstBlock = false;
                    }
                }
                else // It's not a blank or consensus line.
                {
                    // It's a data line in a block.
                    // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore
                    string[] tokens = mbfReader.Line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries); // (char[])null uses whitespace delimiters

                    // Both the id and the segment are required; report a malformed line as
                    // invalid input instead of failing with an index exception.
                    if (tokens.Length < 2)
                    {
                        message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                        throw new InvalidDataException(message);
                    }

                    string    id       = tokens[0];
                    string    data     = tokens[1].ToUpper(CultureInfo.InvariantCulture);
                    Sequence  sequence = null;
                    IAlphabet alphabet = Alphabet;

                    inBlock = true;
                    if (isFirstBlock)
                    {
                        // Only when no alphabet was configured is it detected from the data,
                        // and then every sequence of the alignment must yield the same one.
                        if (null == alphabet)
                        {
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Resource.InvalidSymbolInString,
                                    data);
                                throw new InvalidDataException(message);
                            }
                            else
                            {
                                if (null == alignmentAlphabet)
                                {
                                    alignmentAlphabet = alphabet;
                                }
                                else
                                {
                                    if (alignmentAlphabet != alphabet)
                                    {
                                        message = string.Format(
                                            CultureInfo.CurrentCulture,
                                            Properties.Resource.SequenceAlphabetMismatch);
                                        throw new InvalidDataException(message);
                                    }
                                }
                            }
                        }

                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, data);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, data);
                        }

                        sequence.ID         = id;

                        // Kept writable while parsing so later blocks can append to it;
                        // the requested read-only state is applied after the loop.
                        sequence.IsReadOnly = false;

                        mapIdToSequence.Add(id, sequence);
                    }
                    else
                    {
                        ISequence knownSequence;
                        if (!mapIdToSequence.TryGetValue(id, out knownSequence))
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.ClustalUnknownSequence, id);
                            throw new InvalidDataException(message);
                        }

                        sequence = (Sequence)knownSequence;
                        sequence.InsertRange(sequence.Count, data);
                    }
                }

                mbfReader.GoToNextLine();
            }

            SequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
            foreach (Sequence alignmentSequence in mapIdToSequence.Values)
            {
                alignmentSequence.IsReadOnly = isReadOnly;
                sequenceAlignment.AlignedSequences[0].Sequences.Add(alignmentSequence);
            }

            return(sequenceAlignment);
        }
示例#23
0
        /// <summary>
        /// Parses a single Nexus text from a reader into a sequence alignment.
        /// After the header, each block is dispatched on the second token of its opening
        /// line: TAXA supplies the sequence IDs, CHARACTERS supplies the sequence data,
        /// and any other block is skipped up to its "end;" line.
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
        /// If this flag is set to true then the resulting sequence's isReadOnly property
        /// will be set to true, otherwise it will be set to false.</param>
        /// <returns>A new sequence alignment containing one sequence per ID of the TAXA block.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
        /// <exception cref="InvalidDataException">A block line has no block name, the CHARACTERS
        /// block appears before any TAXA block, a sequence ID has no data in the matrix, or the
        /// alphabet cannot be identified or differs between sequences.</exception>
        protected ISequenceAlignment ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            if (mbfReader == null)
            {
                throw new ArgumentNullException("mbfReader");
            }

            ParseHeader(mbfReader);

            string             message           = string.Empty;
            ISequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
            IList<string> ids       = null;
            bool          isInBlock = true;

            if (mbfReader.Line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
            {
                while (mbfReader.HasLines && isInBlock)
                {
                    if (string.IsNullOrEmpty(mbfReader.Line.Trim()))
                    {
                        mbfReader.GoToNextLine();
                        continue;
                    }

                    // The block name is the second token; a line without one cannot be dispatched.
                    IList<string> blockTokens = GetTokens(mbfReader.Line);
                    if (blockTokens.Count < 2)
                    {
                        message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                        throw new InvalidDataException(message);
                    }

                    string blockName = blockTokens[1];

                    switch (blockName.ToUpper(CultureInfo.InvariantCulture))
                    {
                    case "TAXA":
                    case "TAXA;":
                        // This block contains the count of sequence & title of each sequence
                        ids = (IList<string>)ParseTaxaBlock(mbfReader);

                        break;

                    case "CHARACTERS":
                    case "CHARACTERS;":
                        // The matrix rows are matched against the IDs of the TAXA block, so
                        // that block has to come first.
                        if (ids == null)
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                            throw new InvalidDataException(message);
                        }

                        // Block contains sequences
                        Dictionary<string, string> dataSet = ParseCharacterBlock(mbfReader, ids);

                        IAlphabet alignmentAlphabet = null;
                        string    data = string.Empty;

                        foreach (string ID in ids)
                        {
                            IAlphabet alphabet = Alphabet;
                            Sequence  sequence = null;

                            // Every declared taxon needs a row in the matrix.
                            if (!dataSet.TryGetValue(ID, out data))
                            {
                                message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                                throw new InvalidDataException(message);
                            }

                            // Only when no alphabet was configured is it detected from the data,
                            // and then every sequence of the alignment must yield the same one.
                            if (null == alphabet)
                            {
                                alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                                if (null == alphabet)
                                {
                                    message = string.Format(
                                        CultureInfo.InvariantCulture,
                                        Resource.InvalidSymbolInString,
                                        data);
                                    throw new InvalidDataException(message);
                                }
                                else
                                {
                                    if (null == alignmentAlphabet)
                                    {
                                        alignmentAlphabet = alphabet;
                                    }
                                    else
                                    {
                                        if (alignmentAlphabet != alphabet)
                                        {
                                            message = string.Format(
                                                CultureInfo.InvariantCulture,
                                                Properties.Resource.SequenceAlphabetMismatch);
                                            throw new InvalidDataException(message);
                                        }
                                    }
                                }
                            }

                            if (Encoding == null)
                            {
                                sequence = new Sequence(alphabet, data);
                            }
                            else
                            {
                                sequence = new Sequence(alphabet, Encoding, data);
                            }

                            sequence.IsReadOnly = isReadOnly;
                            sequence.ID         = ID;
                            sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                        }

                        break;

                    case "END":
                    case "END;":
                        // Have reached the end of block
                        isInBlock = false;

                        break;

                    default:
                        // skip this block
                        while (mbfReader.HasLines)
                        {
                            mbfReader.GoToNextLine();

                            // Compare the trimmed line so an indented "end;" still closes the block.
                            string skippedLine = mbfReader.Line;
                            if (skippedLine != null &&
                                0 == string.Compare(skippedLine.Trim(), "end;", StringComparison.OrdinalIgnoreCase))
                            {
                                break;
                            }
                        }

                        break;
                    }

                    mbfReader.GoToNextLine();
                }
            }

            return(sequenceAlignment);
        }
示例#24
0
        /// <summary>
        /// Reads one four-line FASTQ record from the reader and converts it into a QualitativeSequence.
        /// The record consists of an "@" header line, the sequence line, a "+" line and the
        /// quality score line; the reader is advanced past the quality score line.
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the resulting QualitativeSequence.
        /// </param>
        /// <returns>A new QualitativeSequence instance containing parsed data.</returns>
        private IQualitativeSequence ParseOneWithFastQFormat(MBFTextReader mbfReader, bool isReadOnly)
        {
            string message;

            // Line 1: the header, which has to begin with '@'.
            bool hasHeaderLine = mbfReader.HasLines && mbfReader.Line.StartsWith("@", StringComparison.Ordinal);
            if (!hasHeaderLine)
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, Name);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            string id = mbfReader.GetLineField(2).Trim();

            // Line 2: the sequence symbols, which must be present.
            mbfReader.GoToNextLine();
            bool hasSequenceLine = mbfReader.HasLines && !string.IsNullOrEmpty(mbfReader.Line);
            if (!hasSequenceLine)
            {
                message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    Name,
                    string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidSequenceLine, id));
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            string sequenceLine = mbfReader.Line;

            // Line 3: the quality score header, which has to begin with '+'.
            mbfReader.GoToNextLine();
            bool hasQualityHeader = mbfReader.HasLines && mbfReader.Line.StartsWith("+", StringComparison.Ordinal);
            if (!hasQualityHeader)
            {
                message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    Name,
                    string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoreHeaderLine, id));
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // When the '+' line repeats an ID, it has to be the one from the '@' line.
            string qualityHeaderId = mbfReader.GetLineField(2).Trim();
            if (qualityHeaderId.Length != 0 && qualityHeaderId != id)
            {
                message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    Name,
                    string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoreHeaderData, id));
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Line 4: the quality scores, which must be present.
            mbfReader.GoToNextLine();
            bool hasQualityLine = mbfReader.HasLines && !string.IsNullOrEmpty(mbfReader.Line);
            if (!hasQualityLine)
            {
                message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    Name,
                    string.Format(CultureInfo.CurrentCulture, Resource.FastQ_EmptyQualityScoreLine, id));
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            string qualityLine = mbfReader.Line;
            byte[] qualScores  = ASCIIEncoding.ASCII.GetBytes(qualityLine);

            // One quality score character is required per sequence symbol.
            if (qualityLine.Length != sequenceLine.Length)
            {
                message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    Name,
                    string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidQualityScoresLength, id));
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // Move past this record.
            mbfReader.GoToNextLine();

            // Use the configured alphabet, or detect one from the sequence line when none is set.
            IAlphabet sequenceAlphabet = Alphabet;
            if (sequenceAlphabet == null)
            {
                sequenceAlphabet = _commonSequenceParser.IdentifyAlphabet(sequenceAlphabet, sequenceLine);
            }

            if (sequenceAlphabet == null)
            {
                message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    Name,
                    string.Format(CultureInfo.CurrentCulture, Resource.InvalidSymbolInString, sequenceLine));
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            // The configured format type is used unless auto-detection is switched on.
            FastQFormatType formatType = FastqType;
            if (AutoDetectFastQFormat)
            {
                formatType = IdentifyFastQFormatType(qualScores);
            }

            QualitativeSequence parsedSequence = (Encoding == null)
                ? new QualitativeSequence(sequenceAlphabet, formatType, sequenceLine, qualScores)
                : new QualitativeSequence(sequenceAlphabet, formatType, Encoding, sequenceLine, qualScores);

            parsedSequence.ID         = id;
            parsedSequence.IsReadOnly = isReadOnly;

            return(parsedSequence);
        }
示例#25
0
        /// <summary>
        /// Parses the body of a Nexus TAXA block: reads the declared number of sequences
        /// from the "ntax=" entry of the DIMENSIONS statement and the sequence titles from
        /// the TAXLABELS statement.
        /// </summary>
        /// <param name="mbfReader">A reader for a biological sequence text. The current line
        /// is not examined; parsing starts with the line that follows it.</param>
        /// <returns>List of sequence IDs</returns>
        /// <exception cref="InvalidDataException">The number of labels read differs from the
        /// "ntax=" value.</exception>
        /// <exception cref="FormatException">The "ntax=" value is not a valid integer.</exception>
        private static IList<string> ParseTaxaBlock(MBFTextReader mbfReader)
        {
            bool          isInTaxaBlock = true;
            string        data          = string.Empty;
            int           sequenceCount = 0;
            IList<string> IDs           = new List<string>();

            while (mbfReader.HasLines && isInTaxaBlock)
            {
                mbfReader.GoToNextLine();

                // Blank lines carry no statement; skip them so that tokens[0] is never
                // read from an empty token list.
                string currentLine = mbfReader.Line;
                if (string.IsNullOrEmpty(currentLine) || 0 == currentLine.Trim().Length)
                {
                    continue;
                }

                IList<string> tokens = GetTokens(currentLine);
                if (0 == tokens.Count)
                {
                    continue;
                }

                switch (tokens[0].ToUpper(CultureInfo.InvariantCulture))
                {
                case "DIMENSIONS":
                    tokens[0] = string.Empty;

                    // Parse dimensions
                    // 1. Read count of sequence
                    // The statement may span several lines; it ends with the line that ends in ';'.
                    do
                    {
                        foreach (string token in tokens)
                        {
                            data = token.Trim(new char[] { ';' });

                            if (string.IsNullOrEmpty(data))
                            {
                                continue;
                            }

                            if (data.StartsWith("ntax=", StringComparison.OrdinalIgnoreCase))
                            {
                                sequenceCount = Int32.Parse(data.Substring(5), CultureInfo.InvariantCulture);
                            }
                        }

                        if (mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                        else
                        {
                            mbfReader.GoToNextLine();
                            tokens = GetTokens(mbfReader.Line);
                        }
                    }while (mbfReader.HasLines);

                    break;

                case "TAXLABELS":
                case "TAXLABELS;":
                    // Blank out the keyword so it is not collected as a label.
                    tokens[0] = string.Empty;

                    // Parse taxlabels
                    // 1. Read IDs of sequence
                    // Every remaining token up to the terminating ';' is one sequence ID.
                    do
                    {
                        foreach (string token in tokens)
                        {
                            data = token.Trim(new char[] { ';' });

                            if (string.IsNullOrEmpty(data))
                            {
                                continue;
                            }

                            IDs.Add(data);
                        }

                        if (mbfReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                        else
                        {
                            mbfReader.GoToNextLine();
                            tokens = GetTokens(mbfReader.Line);
                        }
                    }while (mbfReader.HasLines);

                    break;

                case "END":
                case "END;":
                    // Have reached the end of taxa block
                    isInTaxaBlock = false;
                    break;

                default:
                    // Any other statement of the block is ignored.
                    break;
                }
            }

            // Read the end line "end;"
            mbfReader.GoToNextLine();

            // Validate the count
            if (sequenceCount != IDs.Count)
            {
                throw new InvalidDataException(Properties.Resource.NtaxMismatch);
            }

            return(IDs);
        }