Example #1
0
        /// <summary>
        /// Validates the Nexus signature line and advances the reader past the header,
        /// including an optional bracketed comment (typically the title of the alignment).
        /// </summary>
        /// <remarks>
        /// On return, <c>SkipBlankLines</c> on the reader has been set to false because
        /// blank lines act as block separators in the rest of the file.
        /// </remarks>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when the current line does not start with "#NEXUS".
        /// </exception>
        private void ParseHeader(BioTextReader bioReader)
        {
            if (!bioReader.Line.StartsWith("#NEXUS", StringComparison.OrdinalIgnoreCase))
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            bioReader.GoToNextLine();  // Skip blank lines until we get to the first block.

            // Title of Alignment: a comment that opens with '[' and runs up to the line ending with ']'.
            if (bioReader.HasLines && bioReader.Line.Trim().StartsWith("[", StringComparison.Ordinal))
            {
                // Test the current line before advancing so that a comment which opens and
                // closes on the same line does not swallow the lines that follow it, and stop
                // at the end of input rather than reading a line that is not there.
                while (bioReader.HasLines && !bioReader.Line.Trim().EndsWith("]", StringComparison.Ordinal))
                {
                    bioReader.GoToNextLine();
                }
            }

            // NOTE(review): this advance happens whether or not a comment was present,
            // exactly as before - confirm that callers rely on it.
            bioReader.GoToNextLine();

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            bioReader.SkipBlankLines = false;
        }
Example #2
0
        /// <summary>
        /// Returns the data of a header block that spans multiple lines.
        /// </summary>
        /// <remarks>
        /// The result starts with the <c>LineData</c> of the current line. The
        /// <c>LineData</c> of every following line for which <c>LineHasHeader</c> is false
        /// is appended, each preceded by <paramref name="lineBreakSubstitution"/>. The
        /// reader is left on the first line that has a header, or at the end of the input.
        /// </remarks>
        /// <param name="bioReader">Reader positioned on the first line of the block.</param>
        /// <param name="lineBreakSubstitution">Text inserted in place of each line break.</param>
        /// <returns>The combined data of the block.</returns>
        private static string ParseMultiLineData(BioTextReader bioReader, string lineBreakSubstitution)
        {
            string firstLineData = bioReader.LineData;

            bioReader.GoToNextLine();

            // Single-line block: hand back the data untouched.
            if (!bioReader.HasLines || bioReader.LineHasHeader)
            {
                return firstLineData;
            }

            // Accumulate in a builder so long blocks do not re-copy the whole text for every line.
            System.Text.StringBuilder combined = new System.Text.StringBuilder(firstLineData);

            // while succeeding lines start with no header, add to data
            do
            {
                combined.Append(lineBreakSubstitution);
                combined.Append(bioReader.LineData);
                bioReader.GoToNextLine();
            }
            while (bioReader.HasLines && !bioReader.LineHasHeader);

            return combined.ToString();
        }
Example #3
0
        /// <summary>
        /// Reads every sequence alignment available from a BioTextReader.
        /// </summary>
        /// <remarks>
        /// Parsers that need to process file-scope metadata applying to everything in the
        /// file should override this method.
        /// </remarks>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Passed through unchanged to ParseOneWithSpecificFormat for each alignment; it
        /// indicates whether the resulting sequences should be in readonly mode.
        /// </param>
        /// <returns>The list of parsed ISequenceAlignment objects.</returns>
        /// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
        /// <exception cref="InvalidDataException">Thrown when the reader has no lines.</exception>
        protected virtual IList <ISequenceAlignment> Parse(BioTextReader bioReader, bool isReadOnly)
        {
            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            // An empty input is rejected outright.
            if (!bioReader.HasLines)
            {
                throw new InvalidDataException(Properties.Resource.IONoTextToParse);
            }

            List <ISequenceAlignment> parsedAlignments = new List <ISequenceAlignment>();

            while (bioReader.HasLines)
            {
                bool lineIsBlank = bioReader.Line.Trim().Length == 0;

                if (lineIsBlank)
                {
                    // Blank lines between alignments carry no data.
                    bioReader.GoToNextLine();
                }
                else
                {
                    // The format-specific parser consumes one whole alignment and moves the reader on.
                    ISequenceAlignment alignment = ParseOneWithSpecificFormat(bioReader, isReadOnly);
                    parsedAlignments.Add(alignment);
                }
            }

            return parsedAlignments;
        }
Example #4
0
        /// <summary>
        /// Parses aligned-sequence lines from the reader and adds each one to the query
        /// sequences of <paramref name="seqAlignment"/>.
        /// </summary>
        /// <remarks>
        /// Reading stops at the end of the input or at the first line that starts with '@'.
        /// Each line is tab-delimited: eleven mandatory fields followed by any number of
        /// optional fields of the form TAG:TYPE:VALUE.
        /// </remarks>
        /// <param name="seqAlignment">Alignment map that receives the parsed sequences.</param>
        /// <param name="bioReader">Reader positioned on the first alignment line.</param>
        /// <param name="isReadOnly">Passed through to ParseQualityNSequence.</param>
        /// <exception cref="FormatException">
        /// Thrown when a line has fewer than eleven fields, a numeric field cannot be
        /// parsed, or header / optional-field validation reports an error.
        /// </exception>
        private void ParseSequences(SequenceAlignmentMap seqAlignment, BioTextReader bioReader, bool isReadOnly)
        {
            // QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, MRNM, MPOS, ISIZE, SEQ, QUAL
            const int mandatoryFieldCount = 11;

            // The validation pattern does not change from line to line; look it up once.
            string optionalFieldPattern = OptionalFieldLinePattern;

            while (bioReader.HasLines && !bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
            {
                string[] tokens = bioReader.Line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);

                // Fail with a format error rather than an index-out-of-range on short lines.
                if (tokens.Length < mandatoryFieldCount)
                {
                    throw new FormatException(string.Format(
                        CultureInfo.CurrentCulture,
                        "Invalid SAM alignment line: expected at least {0} tab-delimited fields but found {1}.",
                        mandatoryFieldCount,
                        tokens.Length));
                }

                SAMAlignedSequence alignedSeq = new SAMAlignedSequence();

                alignedSeq.QName = tokens[0];
                alignedSeq.Flag  = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
                alignedSeq.RName = tokens[2];
                alignedSeq.Pos   = int.Parse(tokens[3], CultureInfo.InvariantCulture);
                alignedSeq.MapQ  = int.Parse(tokens[4], CultureInfo.InvariantCulture);
                alignedSeq.CIGAR = tokens[5];

                // "=" means the mate maps to the same reference as this read.
                alignedSeq.MRNM  = tokens[6] == "=" ? alignedSeq.RName : tokens[6];
                alignedSeq.MPos  = int.Parse(tokens[7], CultureInfo.InvariantCulture);
                alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

                string message = alignedSeq.IsValidHeader();
                if (!string.IsNullOrEmpty(message))
                {
                    throw new FormatException(message);
                }

                // Look up the reference sequence (if any were supplied) by case-insensitive ID.
                ISequence refSeq = null;
                if (RefSequences != null && RefSequences.Count > 0)
                {
                    refSeq = RefSequences.FirstOrDefault(R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
                }

                ParseQualityNSequence(alignedSeq, Alphabet, Encoding, tokens[9], tokens[10], refSeq, isReadOnly);

                // Everything after the mandatory fields is an optional TAG:TYPE:VALUE field.
                for (int i = mandatoryFieldCount; i < tokens.Length; i++)
                {
                    if (!Helper.IsValidRegexValue(optionalFieldPattern, tokens[i]))
                    {
                        message = string.Format(CultureInfo.CurrentCulture, Resource.InvalidOptionalField, tokens[i]);
                        throw new FormatException(message);
                    }

                    // Split into at most three parts so that a value which itself contains
                    // ':' is kept whole instead of being cut at its first colon.
                    string[] opttokens = tokens[i].Split(colonDelim, 3, StringSplitOptions.RemoveEmptyEntries);

                    SAMOptionalField optField = new SAMOptionalField();
                    optField.Tag   = opttokens[0];
                    optField.VType = opttokens[1];
                    optField.Value = opttokens[2];

                    message = optField.IsValid();
                    if (!string.IsNullOrEmpty(message))
                    {
                        throw new FormatException(message);
                    }

                    alignedSeq.OptionalFields.Add(optField);
                }

                seqAlignment.QuerySequences.Add(alignedSeq);
                bioReader.GoToNextLine();
            }
        }
Example #5
0
        /// <summary>
        /// Read XML BLAST data from the reader, and build one or more
        /// BlastResult objects, one per XML document found in the text.
        /// </summary>
        /// <remarks>
        /// The input may hold several XML documents back to back; a line starting with
        /// "&lt;?xml version" marks the start of the next one. Lines starting with
        /// "RPS-BLAST" are ignored.
        /// </remarks>
        /// <param name="reader">The text source</param>
        /// <returns>A list of BLAST result objects</returns>
        /// <exception cref="ArgumentNullException">Thrown when reader is null.</exception>
        /// <exception cref="FormatException">Thrown when no records were found.</exception>
        public IList <BlastResult> Parse(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            List <BlastResult> records = new List <BlastResult>();
            StringBuilder      sb      = new StringBuilder();

            using (BioTextReader bioreader = new BioTextReader(reader))
            {
                // Blank lines are part of the documents, so keep them.
                bioreader.SkipBlankLines = false;
                while (bioreader.HasLines)
                {
                    if (bioreader.Line.StartsWith("RPS-BLAST", StringComparison.OrdinalIgnoreCase))
                    {
                        bioreader.GoToNextLine();
                        continue;
                    }

                    // A new XML declaration closes the document collected so far. Test the
                    // buffer rather than the line number: when a skipped "RPS-BLAST" banner
                    // precedes the first declaration the line number is already past 1 while
                    // nothing has been collected, and an empty document must not be parsed.
                    if (bioreader.Line.StartsWith("<?xml version", StringComparison.OrdinalIgnoreCase) &&
                        sb.Length > 0)
                    {
                        records.Add(ParseXML(sb));
                        sb = new StringBuilder();
                    }

                    sb.AppendLine(bioreader.Line);
                    bioreader.GoToNextLine();
                }
            }

            // Flush the last (or only) document.
            if (sb.Length > 0)
            {
                records.Add(ParseXML(sb));
            }

            if (records.Count == 0)
            {
                string message = Properties.Resource.BlastNoRecords;
                Trace.Report(message);
                throw new FormatException(message);
            }

            return records;
        }
Example #6
0
        /// <summary>
        /// Parses a SequenceAlignmentMap using a BioTextReader.
        /// </summary>
        /// <param name="bioReader">A reader for a sequence alignment text.</param>
        /// <param name="isReadOnly">
        /// Passed through unchanged to ParseOneWithSpecificFormat; it indicates whether the
        /// sequences in the resulting alignment should be in readonly mode.
        /// </param>
        /// <returns>
        /// The result of ParseOneWithSpecificFormat starting at the first non-blank line,
        /// or null when the reader holds nothing but blank lines.
        /// </returns>
        private SequenceAlignmentMap Parse(BioTextReader bioReader, bool isReadOnly)
        {
            // Step over any leading blank lines.
            while (bioReader.HasLines && bioReader.Line.Trim().Length == 0)
            {
                bioReader.GoToNextLine();
            }

            // Nothing left to parse.
            if (!bioReader.HasLines)
            {
                return null;
            }

            return ParseOneWithSpecificFormat(bioReader, isReadOnly);
        }
Example #7
0
        /// <summary>
        /// Parses SAM alignment header from specified BioTextReader.
        /// </summary>
        /// <remarks>
        /// Consecutive lines starting with '@' are consumed. A line whose record type code
        /// is "CO" is stored as a comment; any other line becomes a record field whose
        /// remaining tab-delimited tokens are read as a two-character tag name, one
        /// separator character, and the tag value. The collected header is validated only
        /// when at least one header line was present.
        /// </remarks>
        /// <param name="bioReader">Bio text reader.</param>
        /// <returns>The parsed header; empty when the reader is not positioned on a header line.</returns>
        /// <exception cref="FormatException">Thrown when the header fails validation.</exception>
        private static SAMAlignmentHeader ParserSAMHeader(BioTextReader bioReader)
        {
            SAMAlignmentHeader samHeader = new SAMAlignmentHeader();

            // No header lines: return the empty header without validating it.
            if (!bioReader.HasLines || !bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
            {
                return samHeader;
            }

            while (bioReader.HasLines && bioReader.Line.StartsWith(@"@", StringComparison.OrdinalIgnoreCase))
            {
                string   line           = bioReader.Line;
                string[] tokens         = line.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);
                string   recordTypecode = tokens[0].Substring(1);   // drop the leading '@'

                // Validate the header format.
                ValidateHeaderLineFormat(line);

                if (string.Compare(recordTypecode, "CO", StringComparison.OrdinalIgnoreCase) != 0)
                {
                    SAMRecordField headerLine = new SAMRecordField(recordTypecode);
                    for (int i = 1; i < tokens.Length; i++)
                    {
                        // Two-character tag name, one separator character, then the value.
                        string tagToken = tokens[i];
                        string tagName  = tagToken.Substring(0, 2);
                        headerLine.Tags.Add(new SAMRecordFieldTag(tagName, tagToken.Substring(3)));
                    }

                    samHeader.RecordFields.Add(headerLine);
                }
                else
                {
                    // The comment text follows the "@CO" code and its delimiter; a line too
                    // short to hold any text yields an empty comment instead of an
                    // out-of-range error.
                    samHeader.Comments.Add(line.Length > 4 ? line.Substring(4) : string.Empty);
                }

                bioReader.GoToNextLine();
            }

            string message = samHeader.IsValid();
            if (!string.IsNullOrEmpty(message))
            {
                throw new FormatException(message);
            }

            return samHeader;
        }
Example #8
0
        /// <summary>
        /// Parses the consecutive feature lines for one sequence.
        /// </summary>
        /// <remarks>
        /// Lines starting with the header mark are skipped. Parsing stops at the end of the
        /// input or at the first feature line whose sequence name differs from the sequence
        /// being processed; that line is left unread for the next call. Numeric fields are
        /// validated with the invariant culture because the file content is
        /// machine-readable and must not depend on the user's regional settings.
        /// </remarks>
        /// <param name="bioReader">Reader positioned at the feature lines.</param>
        /// <exception cref="InvalidDataException">
        /// Thrown when a line has an unexpected number of fields or a field has an invalid value.
        /// </exception>
        /// <exception cref="InvalidOperationException">Thrown when no feature line was found.</exception>
        private void ParseFeatures(BioTextReader bioReader)
        {
            // The non-comment lines contain features, which are each stored as MetadataListItems.
            // The fields of each feature are referred to as sub-items.  For GFF, these have
            // unique keys, but for compatibility with our internal representation of features from
            // GenBank format, each sub-item is a list of strings, rather than a simple string.
            List <MetadataListItem <List <string> > > featureList = null;

            Sequence specificSeq = null;

            // fields are tab-delimited
            char[] fieldDelimiter = new char[] { '\t' };

            while (bioReader.HasLines)
            {
                if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    // ignore comments
                    bioReader.GoToNextLine();
                    continue;
                }

                string[] featureFields = bioReader.Line.Split(fieldDelimiter, StringSplitOptions.RemoveEmptyEntries);

                if (featureFields.Length < _minFieldsPerFeature ||
                    featureFields.Length > _maxFieldsPerFeature)
                {
                    string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                    throw new InvalidDataException(message);
                }

                // The featureFields array should now contain the following fields:
                //      featureFields[0]: sequence name
                //      featureFields[1]: source
                //      featureFields[2]: feature name
                //      featureFields[3]: start
                //      featureFields[4]: end
                //      featureFields[5]: score
                //      featureFields[6]: strand
                //      featureFields[7]: frame
                //      featureFields[8]: attributes (optional)

                // Process sequence name.
                if (specificSeq == null)
                {
                    specificSeq = GetSpecificSequence(featureFields[0], MoleculeType.Invalid, bioReader);

                    // Retrieve features list, or add empty features list to metadata if this
                    // is the first feature.
                    if (specificSeq.Metadata.ContainsKey("features"))
                    {
                        // NOTE(review): if the stored value is of another type this cast yields
                        // null and the Add below fails - confirm that cannot happen.
                        featureList = specificSeq.Metadata["features"] as
                                      List <MetadataListItem <List <string> > >;
                    }
                    else
                    {
                        featureList = new List <MetadataListItem <List <string> > >();
                        specificSeq.Metadata["features"] = featureList;
                    }
                }
                else if (specificSeq.DisplayID != featureFields[0])
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }

                // use feature name as key; attributes field is stored as free text
                string attributes = (featureFields.Length == 9 ? featureFields[8] : string.Empty);
                MetadataListItem <List <string> > feature = new MetadataListItem <List <string> >(featureFields[2], attributes);

                // source
                feature.SubItems.Add("source", new List <string> { featureFields[1] });

                // start is an int
                if (!IsInvariantInteger(featureFields[3]))
                {
                    throw CreateInvalidFieldException("start", featureFields[3]);
                }

                feature.SubItems.Add("start", new List <string> { featureFields[3] });

                // end is an int
                if (!IsInvariantInteger(featureFields[4]))
                {
                    throw CreateInvalidFieldException("end", featureFields[4]);
                }

                feature.SubItems.Add("end", new List <string> { featureFields[4] });

                // score is a double, or a dot as a space holder
                if (featureFields[5] != ".")
                {
                    if (!IsInvariantDouble(featureFields[5]))
                    {
                        throw CreateInvalidFieldException("score", featureFields[5]);
                    }

                    feature.SubItems.Add("score", new List <string> { featureFields[5] });
                }

                // strand is + or -, or a dot as a space holder
                if (featureFields[6] != ".")
                {
                    if (featureFields[6] != "+" && featureFields[6] != "-")
                    {
                        throw CreateInvalidFieldException("strand", featureFields[6]);
                    }

                    feature.SubItems.Add("strand", new List <string> { featureFields[6] });
                }

                // frame is an int, or a dot as a space holder
                if (featureFields[7] != ".")
                {
                    if (!IsInvariantInteger(featureFields[7]))
                    {
                        throw CreateInvalidFieldException("frame", featureFields[7]);
                    }

                    feature.SubItems.Add("frame", new List <string> { featureFields[7] });
                }

                // done with that one
                featureList.Add(feature);
                bioReader.GoToNextLine();
            }

            // A feature file with no features?  May it never be.
            if (featureList == null)
            {
                string message = Properties.Resource.GFFNoFeatures;
                Trace.Report(message);
                throw new InvalidOperationException(message);
            }
        }

        // Returns true when the text is an integer in culture-invariant notation.
        private static bool IsInvariantInteger(string text)
        {
            int ignored;
            return int.TryParse(text, NumberStyles.Integer, CultureInfo.InvariantCulture, out ignored);
        }

        // Returns true when the text is a floating-point number in culture-invariant notation.
        private static bool IsInvariantDouble(string text)
        {
            double ignored;
            return double.TryParse(
                text,
                NumberStyles.Float | NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture,
                out ignored);
        }

        // Reports an invalid feature field and builds the exception for the caller to throw.
        private static InvalidDataException CreateInvalidFieldException(string fieldName, string fieldValue)
        {
            string message = String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.GffInvalidField,
                fieldName,
                fieldValue);
            Trace.Report(message);
            return new InvalidDataException(message);
        }
Example #9
0
        /// <summary>
        /// Parses a single Phylip text from a reader into a sequence alignment.
        /// 1. First line has Count of Taxa and length of each sequence
        /// 2. Sequences
        ///     a. First ten character are ID
        ///     b. Sequence itself
        /// </summary>
        /// <remarks>
        /// The first block supplies an ID and the start of each sequence. Each later block
        /// supplies one continuation line per sequence, in the same order. Blank lines
        /// between blocks are skipped. The reader's <c>SkipBlankLines</c> is set to false.
        /// </remarks>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
        /// If this flag is set to true then the resulting sequence's isReadOnly property
        /// will be set to true, otherwise it will be set to false.</param>
        /// <returns>A new Sequence Alignment instance containing parsed data.</returns>
        /// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
        /// <exception cref="InvalidDataException">
        /// Thrown when the first line or a sequence line is malformed, the alphabet cannot
        /// be identified or differs between sequences, or the sequence count or lengths do
        /// not match the values declared on the first line.
        /// </exception>
        protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
        {
            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            string message;

            // Parse first line
            IList <string> tokens = GetTokens(bioReader.Line);

            if (2 != tokens.Count)
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            int sequenceCount  = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
            int sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

            // With a non-positive count the per-block loop below would never consume a line
            // and the outer loop would spin forever on the first non-blank line.
            if (sequenceCount <= 0)
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            bool             isFirstBlock      = true;
            IList <Sequence> data              = new List <Sequence>();
            IAlphabet        alignmentAlphabet = null;

            bioReader.GoToNextLine();  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            bioReader.SkipBlankLines = false;

            while (bioReader.HasLines)
            {
                if (string.IsNullOrEmpty(bioReader.Line.Trim()))
                {
                    bioReader.GoToNextLine();
                    continue;
                }

                for (int index = 0; index < sequenceCount; index++)
                {
                    // Truncated block: stop reading and let the count / length validation
                    // below report the problem.
                    if (!bioReader.HasLines)
                    {
                        break;
                    }

                    if (isFirstBlock)
                    {
                        string id;
                        string sequenceString;

                        tokens = GetTokens(bioReader.Line);

                        if (0 == tokens.Count)
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                            throw new InvalidDataException(message);
                        }

                        if (1 == tokens.Count)
                        {
                            // ID and sequence run together: the first ten characters are the ID.
                            if (tokens[0].Length < 10)
                            {
                                message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                                throw new InvalidDataException(message);
                            }

                            id             = tokens[0].Substring(0, 10);
                            sequenceString = tokens[0].Substring(10);
                        }
                        else
                        {
                            id             = tokens[0];
                            sequenceString = tokens[1];
                        }

                        // Use the configured alphabet when there is one; otherwise detect it and
                        // require every sequence of the alignment to share the detected alphabet.
                        IAlphabet alphabet = Alphabet;
                        if (null == alphabet)
                        {
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, sequenceString);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Resource.InvalidSymbolInString,
                                    sequenceString);
                                throw new InvalidDataException(message);
                            }

                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                message = Properties.Resource.SequenceAlphabetMismatch;
                                throw new InvalidDataException(message);
                            }
                        }

                        Sequence sequence;
                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, sequenceString);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, sequenceString);
                        }

                        sequence.ID = id;

                        // Stay writable until every block has been appended.
                        sequence.IsReadOnly = false;
                        data.Add(sequence);
                    }
                    else
                    {
                        // Later blocks: append the line to the sequence at the same position.
                        Sequence existing = data[index];
                        existing.InsertRange(existing.Count, bioReader.Line.Trim());
                    }

                    bioReader.GoToNextLine();
                }

                // Reset the first block flag
                isFirstBlock = false;
            }

            // Validate for the count of sequence
            if (sequenceCount != data.Count)
            {
                throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
            }

            SequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

            foreach (Sequence dataSequence in data)
            {
                dataSequence.IsReadOnly = isReadOnly;

                // Validate for the length of sequence
                if (sequenceLength != dataSequence.Count)
                {
                    throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
                }

                sequenceAlignment.AlignedSequences[0].Sequences.Add(dataSequence);
            }

            return sequenceAlignment;
        }
Example #10
0
        /// <summary>
        /// Gets the list of sequence titles
        /// </summary>
        /// <remarks>
        /// Reads lines after the current one until an "END" line. The "DIMENSIONS"
        /// statement supplies the expected number of taxa ("ntax=") and the "TAXLABELS"
        /// statement supplies the IDs. Either statement may span several lines and ends
        /// with the line that finishes with ';'. Other lines are ignored. One further
        /// line is consumed after the block before returning.
        /// </remarks>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <returns>List of sequence IDs</returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when the number of IDs read differs from the declared "ntax" value.
        /// </exception>
        private static IList <string> ParseTaxaBlock(BioTextReader bioReader)
        {
            bool           isInTaxaBlock = true;
            int            sequenceCount = 0;
            IList <string> IDs           = new List <string>();

            while (bioReader.HasLines && isInTaxaBlock)
            {
                bioReader.GoToNextLine();

                // Ran off the end of the input before an "END" line.
                if (!bioReader.HasLines)
                {
                    break;
                }

                IList <string> tokens = GetTokens(bioReader.Line);

                // Nothing to dispatch on (e.g. a blank line).
                if (tokens.Count == 0)
                {
                    continue;
                }

                switch (tokens[0].ToUpper(CultureInfo.InvariantCulture))
                {
                case "DIMENSIONS":
                    // Parse dimensions
                    // 1. Read count of sequence
                    foreach (string item in ReadTaxaStatement(bioReader, tokens))
                    {
                        if (item.StartsWith("ntax=", StringComparison.OrdinalIgnoreCase))
                        {
                            sequenceCount = Int32.Parse(item.Substring(5), CultureInfo.InvariantCulture);
                        }
                    }

                    break;

                case "TAXLABELS":
                case "TAXLABELS;":
                    // Parse taxlabels
                    // 1. Read IDs of sequence
                    foreach (string label in ReadTaxaStatement(bioReader, tokens))
                    {
                        IDs.Add(label);
                    }

                    break;

                case "END":
                case "END;":
                    // Have reached the end of taxa block
                    isInTaxaBlock = false;
                    break;

                default:
                    break;
                }
            }

            // Read the end line "end;"
            bioReader.GoToNextLine();

            // Validate the count
            if (sequenceCount != IDs.Count)
            {
                throw new InvalidDataException(Properties.Resource.NtaxMismatch);
            }

            return IDs;
        }

        // Collects the non-empty values of one ';'-terminated statement, which may span
        // several lines; the keyword (first token of the first line) is not included.
        private static IList <string> ReadTaxaStatement(BioTextReader bioReader, IList <string> firstLineTokens)
        {
            char[]         terminator = new char[] { ';' };
            IList <string> values     = new List <string>();
            IList <string> tokens     = firstLineTokens;
            int            firstIndex = 1;   // skip the statement keyword on the first line

            while (true)
            {
                for (int i = firstIndex; i < tokens.Count; i++)
                {
                    string value = tokens[i].Trim(terminator);
                    if (!string.IsNullOrEmpty(value))
                    {
                        values.Add(value);
                    }
                }

                // The statement ends with the line that finishes with ';'.
                if (bioReader.Line.Trim().EndsWith(";", StringComparison.Ordinal))
                {
                    break;
                }

                bioReader.GoToNextLine();

                // Unterminated statement at the end of the input.
                if (!bioReader.HasLines)
                {
                    break;
                }

                tokens     = GetTokens(bioReader.Line);
                firstIndex = 0;
            }

            return values;
        }
Example #11
0
        // Parses everything before the features section.
        //
        // Reads header lines from bioReader and stores their contents in the
        // GenBankMetadata found at sequence.Metadata[Helper.GenBankMetadataKey].
        // Parsing stops, without consuming the line, at the first FEATURES,
        // BASE COUNT, ORIGIN or CONTIG header, or when the reader runs out of lines.
        //
        // Throws InvalidDataException when a second LOCUS line is found, when the
        // PRIMARY line lacks its column headers, when an unknown header is
        // encountered, or when no LOCUS line was parsed at all.
        private void ParseHeaders(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            string          data     = string.Empty;

            string[] tokens = null;
            // set data indent for headers
            bioReader.DataIndent = _dataIndent;

            // only allow one locus line
            bool haveParsedLocus = false;

            // parse until we hit the features or sequence section
            bool haveFinishedHeaders = false;

            while (bioReader.HasLines && !haveFinishedHeaders)
            {
                switch (bioReader.LineHeader)
                {
                case "LOCUS":
                    if (haveParsedLocus)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserSecondLocus,
                            bioReader.LocationString);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    ParseLocus(bioReader, ref sequence);
                    // sequence is passed by ref, so re-fetch the metadata in case it was replaced
                    metadata        = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    haveParsedLocus = true;
                    // don't go to next line; current line still needs to be processed
                    break;

                case "VERSION":
                    tokens = bioReader.LineData.Split(new char[] { ' ' },
                                                      StringSplitOptions.RemoveEmptyEntries);
                    metadata.Version = new GenBankVersion();

                    if (tokens.Length == 0)
                    {
                        // nothing to parse; keep the empty GenBankVersion and carry on
                        ApplicationLog.WriteLine("WARN: unexpected VERSION header: " + bioReader.Line);
                    }
                    else
                    {
                        // first token contains accession and version
                        Match m = Regex.Match(tokens[0], @"^(?<accession>\w+)\.(?<version>\d+)$");

                        if (m.Success)
                        {
                            metadata.Version.Version = m.Groups["version"].Value;
                            // The first token in the data from the accession line is referred to as
                            // the primary accession number, and should be the one used here in the
                            // version line.
                            string versionLineAccession = m.Groups["accession"].Value;
                            if (metadata.Accession == null)
                            {
                                ApplicationLog.WriteLine("WARN: VERSION processed before ACCESSION");
                            }
                            else
                            {
                                if (!versionLineAccession.Equals(metadata.Accession.Primary))
                                {
                                    ApplicationLog.WriteLine("WARN: VERSION tag doesn't match ACCESSION");
                                }
                                else
                                {
                                    metadata.Version.Accession = metadata.Accession.Primary;
                                }
                            }
                        }

                        // The second token, when present, contains the primary ID (GI number).
                        // It is optional: a VERSION line may carry only "accession.version".
                        if (tokens.Length > 1)
                        {
                            m = Regex.Match(tokens[1], @"^GI:(?<primaryID>.*)");
                            if (m.Success)
                            {
                                metadata.Version.GINumber = m.Groups["primaryID"].Value;
                            }
                        }
                    }
                    bioReader.GoToNextLine();
                    break;

                case "PROJECT":
                    // expected form is "name:number[,number...]"
                    tokens = bioReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.Project      = new ProjectIdentifier();
                        metadata.Project.Name = tokens[0];
                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.Project.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected PROJECT header: " + bioReader.Line);
                    }
                    bioReader.GoToNextLine();
                    break;

                case "SOURCE":
                    ParseSource(bioReader, ref sequence);
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "REFERENCE":
                    ParseReferences(bioReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "COMMENT":
                    ParseComments(bioReader, ref sequence);       // can encounter more than one
                    metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
                    // don't go to next line; current line still needs to be processed
                    break;

                case "PRIMARY":
                    // This header is followed by sequence info in a table format that could be
                    // stored in a custom object.  The first line contains column headers.
                    // For now, just validate the presence of the headers, and save the data
                    // as a string.
                    int[] locs = new int[4];
                    locs[0] = bioReader.LineData.IndexOf("TPA_SPAN", StringComparison.Ordinal);
                    locs[1] = bioReader.LineData.IndexOf("PRIMARY_IDENTIFIER", StringComparison.Ordinal);
                    locs[2] = bioReader.LineData.IndexOf("PRIMARY_SPAN", StringComparison.Ordinal);
                    locs[3] = bioReader.LineData.IndexOf("COMP", StringComparison.Ordinal);
                    if (locs[0] < 0 || locs[1] < 0 || locs[2] < 0 || locs[3] < 0)
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserPrimaryLineError,
                            bioReader.Line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                    string primaryData = ParseMultiLineData(bioReader, Environment.NewLine);
                    metadata.Primary = primaryData;
                    // don't go to next line; current line still needs to be processed
                    break;

                // all the following are extracted the same way - possibly multiline
                case "DEFINITION":
                    metadata.Definition = ParseMultiLineData(bioReader, " ");
                    break;

                case "ACCESSION":
                    data = ParseMultiLineData(bioReader, " ");
                    metadata.Accession = new GenBankAccession();
                    // first entry is the primary accession; the rest are secondary
                    string[] accessions = data.Split(' ');
                    metadata.Accession.Primary = accessions[0];

                    for (int i = 1; i < accessions.Length; i++)
                    {
                        metadata.Accession.Secondary.Add(accessions[i]);
                    }
                    break;

                case "DBLINK":
                    // expected form is "type:number[,number...]"
                    tokens = bioReader.LineData.Split(':');
                    if (tokens.Length == 2)
                    {
                        metadata.DBLink = new CrossReferenceLink();
                        if (string.Compare(tokens[0],
                                           CrossReferenceType.Project.ToString(),
                                           StringComparison.OrdinalIgnoreCase) == 0)
                        {
                            metadata.DBLink.Type = CrossReferenceType.Project;
                        }
                        else
                        {
                            metadata.DBLink.Type = CrossReferenceType.TraceAssemblyArchive;
                        }

                        tokens = tokens[1].Split(',');
                        for (int i = 0; i < tokens.Length; i++)
                        {
                            metadata.DBLink.Numbers.Add(tokens[i]);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected DBLINK header: " + bioReader.Line);
                    }
                    bioReader.GoToNextLine();
                    break;

                case "DBSOURCE":
                    metadata.DBSource = ParseMultiLineData(bioReader, " ");
                    break;

                case "KEYWORDS":
                    metadata.Keywords = ParseMultiLineData(bioReader, " ");
                    break;

                case "SEGMENT":
                    data = ParseMultiLineData(bioReader, " ");
                    // Expected form is "<current> of <count>".  Split on the whole word "of",
                    // not on the individual characters 'o' and 'f'.
                    string[] segmentDelimiters = new string[] { "of" };
                    tokens = data.Split(segmentDelimiters, StringSplitOptions.RemoveEmptyEntries);
                    int outvalue;
                    // ParseMultiLineData has already advanced the reader, so the warnings
                    // below report the collected SEGMENT data rather than the current line.
                    if (tokens.Length == 2)
                    {
                        metadata.Segment = new SequenceSegment();
                        if (int.TryParse(tokens[0].Trim(), out outvalue))
                        {
                            metadata.Segment.Current = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                        }

                        if (int.TryParse(tokens[1].Trim(), out outvalue))
                        {
                            metadata.Segment.Count = outvalue;
                        }
                        else
                        {
                            ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                        }
                    }
                    else
                    {
                        ApplicationLog.WriteLine("WARN: unexpected SEGMENT header: " + data);
                    }
                    break;

                // all the following indicate sections beyond the headers parsed by this method
                case "FEATURES":
                case "BASE COUNT":
                case "ORIGIN":
                case "CONTIG":
                    haveFinishedHeaders = true;
                    break;

                default:
                    ApplicationLog.WriteLine(ToString() + "WARN: unknown {0} -> {1}", bioReader.LineHeader, bioReader.LineData);
                    string errMessage = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParseHeaderError,
                        bioReader.LineHeader);
                    Trace.Report(errMessage);
                    throw new InvalidDataException(errMessage);
                }
            }

            // check for required features
            if (!haveParsedLocus)
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
        }
Example #12
0
        // Processes headers, which are a type of comment.
        //
        // Consumes consecutive lines that start with _commentMark.  Lines that also
        // start with _headerMark are interpreted as directives (gff-version,
        // source-version, date, type, DNA/RNA/Protein, sequence-region); any other
        // comment line is skipped.
        //
        // Throws NotSupportedException for a gff-version other than "2", and
        // FormatException for an invalid or missing date, an invalid or missing
        // molecule type, or a malformed embedded sequence.
        private void ParseHeaders(BioTextReader bioReader)
        {
            while (bioReader.HasLines && bioReader.Line.StartsWith(_commentMark, StringComparison.Ordinal))
            {
                Sequence specificSeq = null;

                // process headers, but ignore other comments
                if (bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                {
                    string[] fields = bioReader.GetLineField(3).Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                    // A header mark with nothing after it carries no directive; treat it
                    // like any other comment instead of indexing into an empty array.
                    if (fields.Length == 0)
                    {
                        bioReader.GoToNextLine();
                        continue;
                    }

                    switch (fields[0].ToUpperInvariant())
                    {
                    case "GFF-VERSION":
                        if (fields.Length > 1 && fields[1] != "2")
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffUnsupportedVersion,
                                bioReader.LocationString);
                            Trace.Report(message);
                            throw new NotSupportedException(message);
                        }
                        // don't store this
                        break;

                    case "SOURCE-VERSION":
                        // NOTE(review): fewer than three fields still raises
                        // IndexOutOfRangeException here; no dedicated resource message
                        // for this case is visible in this part of the file.
                        _commonSeq.Metadata["source"]  = fields[1];
                        _commonSeq.Metadata["version"] = fields[2];
                        break;

                    case "DATE":
                        DateTime date;
                        // a missing date is reported the same way as an unparsable one
                        if (fields.Length < 2 || !DateTime.TryParse(fields[1], out date))
                        {
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.ParserInvalidDate,
                                bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        _commonSeq.Metadata["date"] = date;
                        break;

                    case "TYPE":
                        if (fields.Length < 2)
                        {
                            // directive without any molecule type
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.InvalidType,
                                bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        if (fields.Length == 2)
                        {
                            // type only: applies to the common sequence
                            _commonSeq.MoleculeType = GetMoleculeType(fields[1]);
                            if (_commonSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }
                        }
                        else
                        {
                            // type followed by a sequence name: applies to that sequence
                            specificSeq = GetSpecificSequence(fields[2], GetMoleculeType(fields[1]), bioReader);

                            if (specificSeq.MoleculeType == MoleculeType.Invalid)
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.InvalidType,
                                    bioReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }
                        }
                        break;

                    case "DNA":
                    case "RNA":
                    case "PROTEIN":
                        if (fields.Length < 2)
                        {
                            // embedded sequence without a sequence name
                            string message = String.Format(
                                CultureInfo.CurrentCulture,
                                Properties.Resource.GffInvalidSequence,
                                bioReader.LocationString);
                            Trace.Report(message);
                            throw new FormatException(message);
                        }

                        specificSeq = GetSpecificSequence(fields[1], GetMoleculeType(fields[0]), bioReader);
                        bioReader.GoToNextLine();

                        // every line up to the matching "##end-<type>" marker must itself be a
                        // header line carrying sequence data
                        while (bioReader.HasLines && bioReader.Line != "##end-" + fields[0])
                        {
                            if (!bioReader.Line.StartsWith(_headerMark, StringComparison.Ordinal))
                            {
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GffInvalidSequence,
                                    bioReader.LocationString);
                                Trace.Report(message);
                                throw new FormatException(message);
                            }

                            specificSeq.InsertRange(specificSeq.Count, bioReader.GetLineField(3));

                            bioReader.GoToNextLine();
                        }

                        // the end marker itself is consumed by the GoToNextLine below
                        break;

                    case "SEQUENCE-REGION":
                        // NOTE(review): fewer than four fields still raises
                        // IndexOutOfRangeException here; no dedicated resource message
                        // for this case is visible in this part of the file.
                        specificSeq = GetSpecificSequence(fields[1], MoleculeType.Invalid, bioReader);
                        specificSeq.Metadata["start"] = fields[2];
                        specificSeq.Metadata["end"]   = fields[3];
                        break;
                    }
                }

                bioReader.GoToNextLine();
            }
        }
Example #13
0
        /// <summary>
        /// Parses a single Nexus text from a reader into a sequence alignment.
        /// The header is parsed first; then each "begin" block is dispatched by name:
        /// TAXA supplies the sequence IDs, CHARACTERS supplies the sequence data,
        /// and any other block is skipped up to its "end;" line.
        /// </summary>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of every sequence that is
        /// added to the resulting alignment.</param>
        /// <returns>
        /// A new ISequenceAlignment whose first aligned sequence holds the parsed sequences.
        /// </returns>
        /// <exception cref="ArgumentNullException">bioReader is null.</exception>
        /// <exception cref="InvalidDataException">
        /// A CHARACTERS block appears before any TAXA block, the alphabet of a sequence
        /// cannot be identified, or the sequences do not share one alphabet.
        /// </exception>
        protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
        {
            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            ParseHeader(bioReader);

            string             message           = string.Empty;
            ISequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
            IList <string> ids       = null;
            bool           isInBlock = true;

            if (bioReader.Line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
            {
                while (bioReader.HasLines && isInBlock)
                {
                    // blank lines separate blocks; skip them
                    if (string.IsNullOrEmpty(bioReader.Line.Trim()))
                    {
                        bioReader.GoToNextLine();
                        continue;
                    }

                    // second token of the line is the block name
                    string blockName = GetTokens(bioReader.Line)[1];

                    switch (blockName.ToUpper(CultureInfo.InvariantCulture))
                    {
                    case "TAXA":
                    case "TAXA;":
                        // This block contains the count of sequence & title of each sequence
                        ids = (IList <string>)ParseTaxaBlock(bioReader);

                        break;

                    case "CHARACTERS":
                    case "CHARACTERS;":
                        // The sequence IDs come from the TAXA block.  Without them the loop
                        // below cannot run, so report the input as invalid instead of
                        // failing with a NullReferenceException.
                        if (ids == null)
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                            throw new InvalidDataException(message);
                        }

                        // Block contains sequences
                        Dictionary <string, string> dataSet = ParseCharacterBlock(bioReader, ids);

                        IAlphabet alignmentAlphabet = null;
                        string    data = string.Empty;

                        foreach (string ID in ids)
                        {
                            IAlphabet alphabet = Alphabet;
                            Sequence  sequence = null;
                            data = dataSet[ID];

                            // no alphabet configured on the parser: detect it from the data
                            if (null == alphabet)
                            {
                                alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                                if (null == alphabet)
                                {
                                    message = string.Format(
                                        CultureInfo.InvariantCulture,
                                        Resource.InvalidSymbolInString,
                                        data);
                                    throw new InvalidDataException(message);
                                }
                                else
                                {
                                    if (null == alignmentAlphabet)
                                    {
                                        // first detected alphabet becomes the reference
                                        alignmentAlphabet = alphabet;
                                    }
                                    else
                                    {
                                        // all sequences of the alignment must share it
                                        if (alignmentAlphabet != alphabet)
                                        {
                                            message = string.Format(
                                                CultureInfo.InvariantCulture,
                                                Properties.Resource.SequenceAlphabetMismatch);
                                            throw new InvalidDataException(message);
                                        }
                                    }
                                }
                            }

                            if (Encoding == null)
                            {
                                sequence = new Sequence(alphabet, data);
                            }
                            else
                            {
                                sequence = new Sequence(alphabet, Encoding, data);
                            }

                            sequence.IsReadOnly = isReadOnly;
                            sequence.ID         = ID;
                            sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                        }

                        break;

                    case "END":
                    case "END;":
                        // Have reached the end of block
                        isInBlock = false;

                        break;

                    default:
                        // skip this block
                        while (bioReader.HasLines)
                        {
                            bioReader.GoToNextLine();
                            if (0 == string.Compare(bioReader.Line, "end;", StringComparison.OrdinalIgnoreCase))
                            {
                                break;
                            }
                        }

                        break;
                    }

                    bioReader.GoToNextLine();
                }
            }

            return(sequenceAlignment);
        }
Example #14
0
        // Reads the tail of a GenBank record: the optional BASE COUNT line, the
        // ORIGIN line with the sequence data that follows it, and CONTIG data.
        // Stops after consuming the "//" end-of-record line, or when the reader
        // has no more lines.  Any other header raises InvalidDataException.
        private void ParseSequence(BioTextReader bioReader, ref Sequence sequence)
        {
            string errorText = string.Empty;

            GenBankMetadata genBankMetadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            // sequence headers use the standard data indent
            bioReader.DataIndent = _dataIndent;

            while (bioReader.HasLines)
            {
                // "//" terminates the record: consume it and we are done
                if (bioReader.Line.StartsWith("//", StringComparison.Ordinal))
                {
                    bioReader.GoToNextLine();
                    return;
                }

                string header = bioReader.LineHeader;

                if (header == "BASE COUNT")
                {
                    // BASE COUNT is obsolete (dropped from the GenBank flatfile format in
                    // October 2003) but is kept when present.  The raw line is used because
                    // the data begins with a right-justified column.
                    genBankMetadata.BaseCount = bioReader.Line.Substring(_dataIndent);
                    bioReader.GoToNextLine();
                }
                else if (header == "ORIGIN")
                {
                    // Optional data may follow ORIGIN; never store an empty string.
                    if (!String.IsNullOrEmpty(bioReader.LineData))
                    {
                        genBankMetadata.Origin = bioReader.LineData;
                    }

                    bioReader.GoToNextLine();

                    IAlphabet detected = null;

                    // sequence lines are the ones that start with a space
                    while (bioReader.HasLines && bioReader.Line[0] == ' ')
                    {
                        // Manual slicing instead of a regex, which is too slow here:
                        // data starts at column 10 and is read in 10-character chunks,
                        // stepping 11 columns each time.
                        int lineLength = bioReader.Line.Length;

                        for (int start = 10; start < lineLength; start += 11)
                        {
                            string chunk = bioReader.Line.Substring(start, Math.Min(10, lineLength - start));

                            if (Alphabet == null)
                            {
                                detected = IdentifyAlphabet(detected, chunk);

                                if (detected == null)
                                {
                                    errorText = String.Format(CultureInfo.CurrentCulture, Resource.InvalidSymbolInString, bioReader.Line);
                                    Trace.Report(errorText);
                                    throw new InvalidDataException(errorText);
                                }

                                // alphabet changed: rebuild the sequence on the new alphabet
                                if (sequence.Alphabet != detected)
                                {
                                    Sequence rebuilt = new Sequence(detected, Encoding, sequence);
                                    rebuilt.MoleculeType = sequence.MoleculeType;
                                    rebuilt.IsReadOnly   = false;
                                    sequence.Clear();
                                    sequence = rebuilt;
                                }
                            }

                            sequence.InsertRange(sequence.Count, chunk);
                        }

                        bioReader.GoToNextLine();
                    }
                }
                else if (header == "CONTIG")
                {
                    // the helper leaves the reader on the next unprocessed line
                    genBankMetadata.Contig = ParseMultiLineData(bioReader, Environment.NewLine);
                }
                else
                {
                    errorText = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserUnexpectedLineInSequence,
                        bioReader.Line);
                    Trace.Report(errorText);
                    throw new InvalidDataException(errorText);
                }
            }
        }
Example #15
0
        // Parses the FEATURES section of a GenBank record.
        //
        // Each feature consists of a key, a (possibly multi-line) location string and
        // a list of qualifiers in the form /key="value" or /key.  Parsed features are
        // mapped through StandardFeatureMap.GetStandardFeatureItem and, when at least
        // one was found, stored in the Features property of the GenBankMetadata found
        // at sequence.Metadata[Helper.GenBankMetadataKey].
        //
        // Throws InvalidOperationException when LocationBuilder is null, and
        // InvalidDataException for a feature line without a key or a qualifier
        // with an empty key.
        private void ParseFeatures(BioTextReader bioReader, ref Sequence sequence)
        {
            ILocationBuilder locBuilder = LocationBuilder;

            if (locBuilder == null)
            {
                throw new InvalidOperationException(Resource.NullLocationBuild);
            }

            // set data indent for features
            bioReader.DataIndent = _featureDataIndent;

            // The sub-items of a feature are referred to as qualifiers.  These do not have unique
            // keys, so they are stored as lists in the SubItems dictionary.
            SequenceFeatures    features    = new SequenceFeatures();
            IList <FeatureItem> featureList = features.All;

            while (bioReader.HasLines)
            {
                // skip empty lines and the FEATURES header line itself
                if (String.IsNullOrEmpty(bioReader.Line) || bioReader.LineHeader == "FEATURES")
                {
                    bioReader.GoToNextLine();
                    continue;
                }

                if (bioReader.Line[0] != ' ')
                {
                    // start of non-feature text
                    break;
                }

                if (!bioReader.LineHasHeader)
                {
                    string message = Properties.Resource.GenbankEmptyFeature;
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                // check for multi-line location string
                string featureKey = bioReader.LineHeader;
                string location   = bioReader.LineData;
                bioReader.GoToNextLine();
                while (bioReader.HasLines && !bioReader.LineHasHeader &&
                       bioReader.LineHasData && !bioReader.LineData.StartsWith("/", StringComparison.Ordinal))
                {
                    location += bioReader.LineData;
                    bioReader.GoToNextLine();
                }

                // create features as MetadataListItems
                FeatureItem feature = new FeatureItem(featureKey, locBuilder.GetLocation(location));

                // process the list of qualifiers, which are each in the form of
                // /key="value"
                string qualifierKey   = string.Empty;
                string qualifierValue = string.Empty;
                while (bioReader.HasLines)
                {
                    if (!bioReader.LineHasHeader && bioReader.LineHasData)
                    {
                        // a leading '/' starts a new qualifier; anything else continues
                        // the value of the previous one
                        if (bioReader.LineData.StartsWith("/", StringComparison.Ordinal))
                        {
                            // new qualifier; save previous if this isn't the first
                            if (!String.IsNullOrEmpty(qualifierKey))
                            {
                                AddQualifierToFeature(feature, qualifierKey, qualifierValue);
                            }

                            // set the key and value of this qualifier
                            int equalsIndex = bioReader.LineData.IndexOf('=');
                            if (equalsIndex < 0)
                            {
                                // no value, just key (this is allowed, see NC_005213.gbk)
                                qualifierKey   = bioReader.LineData.Substring(1);
                                qualifierValue = string.Empty;
                            }
                            else if (equalsIndex > 1)
                            {
                                // The line starts with '/', so '=' can never be at index 0;
                                // index 1 means "/=..." (an empty key), which is rejected below.
                                qualifierKey   = bioReader.LineData.Substring(1, equalsIndex - 1);
                                qualifierValue = bioReader.LineData.Substring(equalsIndex + 1);
                            }
                            else
                            {
                                // qualifier with an empty key
                                string message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    Properties.Resource.GenbankInvalidFeature,
                                    bioReader.Line);
                                Trace.Report(message);
                                throw new InvalidDataException(message);
                            }
                        }
                        else
                        {
                            // Continuation of previous line; "note" gets a line break, and
                            // everything else except "translation" and "transl_except" gets a
                            // space to separate words.
                            if (qualifierKey == "note")
                            {
                                qualifierValue += Environment.NewLine;
                            }
                            else if (qualifierKey != "translation" && qualifierKey != "transl_except")
                            {
                                qualifierValue += " ";
                            }

                            qualifierValue += bioReader.LineData;
                        }

                        bioReader.GoToNextLine();
                    }
                    else if (bioReader.Line.StartsWith("\t", StringComparison.Ordinal))
                    {
                        // this seems to be data corruption; but BioPerl test set includes
                        // (old, 2003) NT_021877.gbk which has this problem, so we
                        // handle it
                        ApplicationLog.WriteLine("WARN: nonstandard line format at line {0}: '{1}'",
                                                 bioReader.LineNumber, bioReader.Line);
                        qualifierValue += " " + bioReader.Line.Trim();
                        bioReader.GoToNextLine();
                    }
                    else
                    {
                        // next feature, or end of the features section
                        break;
                    }
                }

                // add last qualifier
                if (!String.IsNullOrEmpty(qualifierKey))
                {
                    AddQualifierToFeature(feature, qualifierKey, qualifierValue);
                }

                // still add feature, even if it has no qualifiers
                featureList.Add(StandardFeatureMap.GetStandardFeatureItem(feature));
            }

            if (featureList.Count > 0)
            {
                ((GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey]).Features = features;
            }
        }
Example #16
0
        /// <summary>
        /// Parses the SOURCE block and its indented ORGANISM sub-block, starting at the reader's
        /// current line, and stores the result as the <c>Source</c> of the sequence's GenBank metadata.
        /// </summary>
        /// <param name="bioReader">
        /// Reader positioned at the start of the SOURCE block. On return it is left on the first
        /// line that belongs to neither SOURCE nor ORGANISM, so the caller can process that line.
        /// </param>
        /// <param name="sequence">
        /// Sequence whose metadata entry stored under <c>Helper.GenBankMetadataKey</c> is updated.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// An indented sub-field other than ORGANISM appears inside the block.
        /// </exception>
        private static void ParseSource(BioTextReader bioReader, ref Sequence sequence)
        {
            string source      = string.Empty;
            string organism    = string.Empty;
            string classLevels = string.Empty;

            while (bioReader.HasLines)
            {
                if (bioReader.LineHeader == "SOURCE")
                {
                    // data can be multiline. spec says last line must end with period
                    // (note: this doesn't apply unless multiline)
                    bool lastDotted = true;
                    source = bioReader.LineData;

                    bioReader.GoToNextLine();
                    while (bioReader.HasLines && !bioReader.LineHasHeader)
                    {
                        source    += " " + bioReader.LineData;
                        lastDotted = source.EndsWith(".", StringComparison.Ordinal);
                        bioReader.GoToNextLine();
                    }

                    // Out-of-spec data is only reported, never rejected.
                    if (!lastDotted && Trace.Want(Trace.SeqWarnings))
                    {
                        Trace.Report("GenBank.ParseSource", Properties.Resource.OutOfSpec, source);
                    }

                    // don't go to next line; current line still needs to be processed
                }
                else if (bioReader.Line.StartsWith(" ", StringComparison.Ordinal))
                {
                    // StartsWith (instead of indexing Line[0]) keeps an empty line from raising
                    // IndexOutOfRangeException; such a line simply ends the block below.
                    if (bioReader.LineHeader != "ORGANISM")
                    {
                        string message = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserInvalidSourceField,
                            bioReader.LineHeader);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }

                    // this also can be multiline
                    organism = bioReader.LineData;

                    bioReader.GoToNextLine();
                    while (bioReader.HasLines && !bioReader.LineHasHeader)
                    {
                        // Continuation lines ending in ';' or '.' are collected as class levels;
                        // any other continuation line extends the organism name.
                        if (bioReader.Line.EndsWith(";", StringComparison.Ordinal) || bioReader.Line.EndsWith(".", StringComparison.Ordinal))
                        {
                            if (!String.IsNullOrEmpty(classLevels))
                            {
                                classLevels += " ";
                            }

                            classLevels += bioReader.LineData;
                        }
                        else
                        {
                            organism += " " + bioReader.LineData;
                        }
                        bioReader.GoToNextLine();
                    }

                    // don't go to next line; current line still needs to be processed
                }
                else
                {
                    // don't go to next line; current line still needs to be processed
                    break;
                }
            }

            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Source            = new SequenceSource();
            metadata.Source.CommonName = source;
            if (!string.IsNullOrEmpty(organism))
            {
                // Text before the first space is the genus; everything after it is the species.
                // With no space (or a leading space) the whole text is stored as the genus.
                int index = organism.IndexOf(" ", StringComparison.Ordinal);
                if (index > 0)
                {
                    metadata.Source.Organism.Genus   = organism.Substring(0, index);
                    metadata.Source.Organism.Species = organism.Substring(index + 1);
                }
                else
                {
                    metadata.Source.Organism.Genus = organism;
                }
            }

            metadata.Source.Organism.ClassLevels = classLevels;
        }
Example #17
0
        /// <summary>
        /// Reads consecutive REFERENCE blocks from the reader and appends one
        /// <c>CitationReference</c> per block to the References list of the sequence's
        /// GenBank metadata.
        /// </summary>
        /// <param name="bioReader">
        /// Reader positioned at the first REFERENCE line. On return it is left on the first line
        /// that is neither a REFERENCE line nor an indented reference sub-field.
        /// </param>
        /// <param name="sequence">
        /// Sequence whose metadata entry stored under <c>Helper.GenBankMetadataKey</c> is updated.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// A REFERENCE line does not start with a number, or an unknown sub-field is found.
        /// </exception>
        /// <exception cref="InvalidOperationException">
        /// The reference number cannot be converted to an integer.
        /// </exception>
        private static void ParseReferences(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankMetadata genBankMetadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];
            IList<CitationReference> citations = genBankMetadata.References;
            CitationReference current = null;

            while (bioReader.HasLines)
            {
                if (bioReader.LineHeader == "REFERENCE")
                {
                    // A new REFERENCE line closes the citation collected so far.
                    if (current != null)
                    {
                        citations.Add(current);
                    }

                    // The line data is a number, optionally followed by a parenthesised note,
                    // e.g. "(bases 1 to 118)" or free prose.
                    Match headerMatch = Regex.Match(
                        bioReader.LineData,
                        @"^(?<number>\d+)(\s+\((?<location>.*)\))?");
                    if (!headerMatch.Success)
                    {
                        string headerError = String.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.ParserReferenceError,
                            bioReader.LineData);
                        Trace.Report(headerError);
                        throw new InvalidDataException(headerError);
                    }

                    string numberText   = headerMatch.Groups["number"].Value;
                    string locationText = headerMatch.Groups["location"].Value;

                    current = new CitationReference();

                    int referenceNumber;
                    if (!int.TryParse(numberText, out referenceNumber))
                    {
                        throw new InvalidOperationException();
                    }

                    current.Number   = referenceNumber;
                    current.Location = locationText;
                    bioReader.GoToNextLine();
                    continue;
                }

                if (!bioReader.Line.StartsWith(" ", StringComparison.Ordinal))
                {
                    // Not part of a reference any more: store the pending citation and stop.
                    // The reader is deliberately not advanced; the caller handles this line.
                    if (current != null)
                    {
                        citations.Add(current);
                    }

                    break;
                }

                // Indented sub-field of the current reference. Every known field is read the
                // same way: its data may continue over several lines, which are joined by a space.
                string fieldName = bioReader.LineHeader;
                switch (fieldName)
                {
                case "AUTHORS":
                    current.Authors = ParseMultiLineData(bioReader, " ");
                    break;

                case "CONSRTM":
                    current.Consortiums = ParseMultiLineData(bioReader, " ");
                    break;

                case "TITLE":
                    current.Title = ParseMultiLineData(bioReader, " ");
                    break;

                case "JOURNAL":
                    current.Journal = ParseMultiLineData(bioReader, " ");
                    break;

                case "REMARK":
                    current.Remarks = ParseMultiLineData(bioReader, " ");
                    break;

                case "MEDLINE":
                    current.Medline = ParseMultiLineData(bioReader, " ");
                    break;

                case "PUBMED":
                    current.PubMed = ParseMultiLineData(bioReader, " ");
                    break;

                default:
                    string fieldError = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidReferenceField,
                        fieldName);
                    Trace.Report(fieldError);
                    throw new InvalidDataException(fieldError);
                }
            }
        }
Example #18
0
        /// <summary>
        /// Parses the LOCUS line (the first line of a GenBank record) at the reader's current
        /// position, stores the result as the <c>Locus</c> of the sequence's GenBank metadata,
        /// sets the sequence ID and molecule type, and advances the reader by one line.
        /// </summary>
        /// <param name="bioReader">Reader positioned on the LOCUS line.</param>
        /// <param name="sequence">
        /// Sequence being built. It is replaced by a new <c>Sequence</c> when the alphabet implied
        /// by the LOCUS line differs from the sequence's current alphabet.
        /// </param>
        /// <exception cref="InvalidDataException">
        /// The line has fewer than three fields, the unit is neither "bp" nor "aa", the strand
        /// type or topology is not recognised, or the implied alphabet conflicts with the
        /// alphabet configured on the parser.
        /// </exception>
        /// <exception cref="InvalidOperationException">The length field is not an integer.</exception>
        /// <exception cref="FormatException">The date or the molecule type cannot be interpreted.</exception>
        private void ParseLocus(BioTextReader bioReader, ref Sequence sequence)
        {
            GenBankLocusInfo locusInfo = new GenBankLocusInfo();

            // GenBank spec recommends token rather than position-based parsing, but this
            // is only partially possible without making extra assumptions about the presence
            // of optional fields.
            string[] tokens = bioReader.LineData.Split(new char[] { ' ' },
                                                       StringSplitOptions.RemoveEmptyEntries);

            // Name, length and unit are read by index below; reject short lines with the regular
            // invalid-locus error instead of letting the indexer throw IndexOutOfRangeException.
            if (tokens.Length < 3)
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            sequence.ID    = tokens[0];
            locusInfo.Name = tokens[0];

            int sequenceLength;

            // File content is machine-readable, so it is parsed with the invariant culture.
            if (!int.TryParse(tokens[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out sequenceLength))
            {
                throw new InvalidOperationException();
            }
            locusInfo.SequenceLength = sequenceLength;

            string seqType = tokens[2];

            if (seqType != "bp" && seqType != "aa")
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            // Determine format version and parse the remaining fields by position.
            string strandType;
            string strandTopology;
            string division;
            string rawDate;
            string molType = string.Empty;

            if (Helper.StringHasMatch(bioReader.GetLineField(31, 32), "bp", "aa"))
            {
                // older format
                strandType     = bioReader.GetLineField(34, 36).Trim();
                strandTopology = bioReader.GetLineField(43, 52).Trim();
                division       = bioReader.GetLineField(53, 56).Trim();
                rawDate        = bioReader.GetLineField(63).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(37, 42).Trim();
                }
            }
            else
            {
                // newer format
                strandType     = bioReader.GetLineField(45, 47).Trim();
                strandTopology = bioReader.GetLineField(56, 63).Trim();
                division       = bioReader.GetLineField(65, 67).Trim();
                rawDate        = bioReader.GetLineField(69).Trim();

                // molecule type field is not used for amino acid chains
                if (seqType != "aa")
                {
                    molType = bioReader.GetLineField(48, 53).Trim();
                }
            }

            // process strand type
            if (!Helper.StringHasMatch(strandType, string.Empty, "ss-", "ds-", "ms-"))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidLocus,
                    bioReader.Line);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }
            locusInfo.Strand = Helper.GetStrandType(strandType);

            // process strand topology
            if (!Helper.StringHasMatch(strandTopology, string.Empty, "linear", "circular"))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidStrand,
                    strandTopology);
                Trace.Report(message);
                throw new InvalidDataException(message);
            }

            locusInfo.StrandTopology = Helper.GetStrandTopology(strandTopology);

            // process division; an unrecognised or empty code falls back to None
            try
            {
                locusInfo.DivisionCode = (SequenceDivisionCode)Enum.Parse(typeof(SequenceDivisionCode), division);
            }
            catch (ArgumentException)
            {
                locusInfo.DivisionCode = SequenceDivisionCode.None;
            }

            // process date
            DateTime date;

            // The date uses English month abbreviations regardless of the machine's locale, so it
            // must be parsed with the invariant culture; the current culture can reject it.
            if (!DateTime.TryParse(rawDate, CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
            {
                string message = String.Format(
                    CultureInfo.CurrentCulture,
                    Properties.Resource.ParserInvalidDate,
                    rawDate);
                Trace.Report(message);
                throw new FormatException(message);
            }

            locusInfo.Date         = date;
            locusInfo.SequenceType = seqType;

            // process sequence type and molecule type
            MoleculeType moleculeType;

            if (seqType == "aa")
            {
                moleculeType = MoleculeType.Protein;
            }
            else
            {
                moleculeType = GetMoleculeType(molType);

                if (moleculeType == MoleculeType.Invalid)
                {
                    string message = String.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ParserInvalidLocus,
                        bioReader.Line);
                    Trace.Report(message);
                    throw new FormatException(message);
                }
            }

            IAlphabet alphabet = GetAlphabet(moleculeType);

            if (alphabet != sequence.Alphabet)
            {
                // An alphabet fixed on the parser wins; a LOCUS line that implies a different
                // one is an error.
                if (Alphabet != null && Alphabet != alphabet)
                {
                    string message = Properties.Resource.ParserIncorrectAlphabet;
                    Trace.Report(message);
                    throw new InvalidDataException(message);
                }

                // Rebuild the sequence on the alphabet implied by the LOCUS line.
                sequence            = new Sequence(alphabet, Encoding, sequence);
                sequence.IsReadOnly = false;
            }

            sequence.MoleculeType  = moleculeType;
            locusInfo.MoleculeType = moleculeType;
            GenBankMetadata metadata = (GenBankMetadata)sequence.Metadata[Helper.GenBankMetadataKey];

            metadata.Locus = locusInfo;
            bioReader.GoToNextLine();
        }
Example #19
0
        /// <summary>
        /// Reads one FASTA record (a '>' header line followed by its data lines) from the
        /// reader and returns it as a sequence.
        /// </summary>
        /// <param name="bioReader">Reader positioned on the record's header line.</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned sequence.
        /// </param>
        /// <returns>A new Sequence instance containing parsed data.</returns>
        protected ISequence ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
        {
            SequencePointer pointer = null;

            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            // A record must open with a '>' header line.
            if (!bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                string headerError = string.Format(
                    CultureInfo.InvariantCulture,
                    Resource.INVAILD_INPUT_FILE,
                    Resource.FASTA_NAME);
                Trace.Report(headerError);
                throw new FileFormatException(headerError);
            }

            // The ID is the header text from the second field position onwards.
            string sequenceId = bioReader.GetLineField(2).Trim();

            // Above the full-load block size the header is counted and a pointer
            // to the record is prepared.
            if (_blockSize > FileLoadHelper.DefaultFullLoadBlockSize)
            {
                _lineCount++;
                _lineLength += bioReader.Line.Length;

                pointer = new SequencePointer();
                pointer.StartingLine = _lineCount;
            }

            bioReader.GoToNextLine();

            // Without a configured alphabet, take a first guess from the first data line.
            IAlphabet detectedAlphabet = Alphabet;
            if (detectedAlphabet == null)
            {
                detectedAlphabet = _commonSequenceParser.IdentifyAlphabet(detectedAlphabet, bioReader.Line);

                if (detectedAlphabet == null)
                {
                    string firstLineError = string.Format(
                        CultureInfo.InvariantCulture,
                        Resource.InvalidSymbolInString,
                        bioReader.Line);
                    Trace.Report(firstLineError);
                    throw new FileFormatException(firstLineError);
                }
            }

            Sequence parsed;
            if (Encoding != null)
            {
                parsed = new Sequence(detectedAlphabet, Encoding, string.Empty);
                parsed.IsReadOnly = false;
            }
            else
            {
                parsed = new Sequence(detectedAlphabet);
            }

            parsed.ID = sequenceId;

            // Set once the start offset of this record's data has been stored.
            bool startRecorded = false;

            while (bioReader.HasLines && !bioReader.Line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                if (Alphabet == null)
                {
                    // Re-identify on every line; the result may differ from the alphabet in use.
                    detectedAlphabet = _commonSequenceParser.IdentifyAlphabet(parsed.Alphabet, bioReader.Line);

                    if (detectedAlphabet == null)
                    {
                        string dataLineError = string.Format(
                            CultureInfo.InvariantCulture,
                            Resource.InvalidSymbolInString,
                            bioReader.Line);
                        Trace.Report(dataLineError);
                        throw new FileFormatException(dataLineError);
                    }

                    if (parsed.Alphabet != detectedAlphabet)
                    {
                        // Carry the data read so far over to a sequence on the new alphabet.
                        Sequence replacement = new Sequence(detectedAlphabet, Encoding, parsed);
                        replacement.IsReadOnly = false;

                        parsed.Clear();
                        parsed = replacement;
                    }
                }

                if (_blockSize <= 0)
                {
                    // full load: keep the symbols in memory
                    parsed.InsertRange(parsed.Count, bioReader.Line);
                }
                else
                {
                    // otherwise only track where the data starts and how long it is
                    if (!startRecorded)
                    {
                        _sequenceBeginsAt = _lineLength;
                        startRecorded     = true;
                    }

                    _lineLength += bioReader.Line.Length;
                    _lineCount++;
                }

                bioReader.GoToNextLine();
            }

            if (parsed.MoleculeType == MoleculeType.Invalid)
            {
                parsed.MoleculeType = CommonSequenceParser.GetMoleculeType(parsed.Alphabet);
            }

            parsed.IsReadOnly = isReadOnly;

            // full load: the in-memory sequence is the result
            if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
            {
                return parsed;
            }

            if (pointer != null)
            {
                pointer.AlphabetName  = parsed.Alphabet.Name;
                pointer.Id            = parsed.ID;
                pointer.StartingIndex = _sequenceBeginsAt;
                pointer.EndingIndex   = _lineLength;
                _sequencePointers.Add(pointer);
            }

            _sequenceCount++;

            // Attach a provider built from this parser and the pointer.
            FileVirtualSequenceProvider provider = new FileVirtualSequenceProvider(this, pointer);
            provider.BlockSize         = _blockSize;
            provider.MaxNumberOfBlocks = _maxNumberOfBlocks;

            parsed.VirtualSequenceProvider = provider;
            return parsed;
        }
Example #20
0
        /// <summary>
        /// Reads one four-line FASTQ record ('@' header, sequence, '+' header, quality scores)
        /// from the reader and returns it as a QualitativeSequence.
        /// </summary>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Value assigned to the IsReadOnly property of the returned QualitativeSequence.
        /// </param>
        /// <returns>A new QualitativeSequence instance containing parsed data.</returns>
        private IQualitativeSequence ParseOneWithFastQFormat(BioTextReader bioReader, bool isReadOnly)
        {
            SequencePointer pointer = new SequencePointer();

            // Line 1: header, must start with '@'.
            if (!bioReader.HasLines || !bioReader.Line.StartsWith("@", StringComparison.Ordinal))
            {
                string fileError = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                Trace.Report(fileError);
                throw new FileFormatException(fileError);
            }

            string id = bioReader.GetLineField(2).Trim();

            _numberOfCharactersParsed += bioReader.Line.Length;
            pointer.StartingIndex      = _numberOfCharactersParsed;
            pointer.StartingLine       = bioReader.LineNumber;

            // Line 2: the sequence symbols, must not be empty.
            bioReader.GoToNextLine();
            if (!bioReader.HasLines || string.IsNullOrEmpty(bioReader.Line))
            {
                throw BuildFastQRecordError(Resource.FastQ_InvalidSequenceLine, id);
            }

            string sequenceLine = bioReader.Line;

            _numberOfCharactersParsed += bioReader.Line.Length;
            pointer.EndingIndex        = _numberOfCharactersParsed;

            // Line 3: quality header, must start with '+'.
            bioReader.GoToNextLine();
            if (!bioReader.HasLines || !bioReader.Line.StartsWith("+", StringComparison.Ordinal))
            {
                throw BuildFastQRecordError(Resource.FastQ_InvalidQualityScoreHeaderLine, id);
            }

            _numberOfCharactersParsed += bioReader.Line.Length;

            // When the '+' line repeats an ID, it has to be the one from the '@' line.
            string qualityHeaderId = bioReader.GetLineField(2).Trim();
            if (!string.IsNullOrEmpty(qualityHeaderId) && !string.Equals(id, qualityHeaderId, StringComparison.Ordinal))
            {
                throw BuildFastQRecordError(Resource.FastQ_InvalidQualityScoreHeaderData, id);
            }

            // Line 4: quality scores, must not be empty.
            bioReader.GoToNextLine();
            if (!bioReader.HasLines || string.IsNullOrEmpty(bioReader.Line))
            {
                throw BuildFastQRecordError(Resource.FastQ_EmptyQualityScoreLine, id);
            }

            _numberOfCharactersParsed += bioReader.Line.Length;

            byte[] qualityScores = ASCIIEncoding.ASCII.GetBytes(bioReader.Line);

            // There must be exactly one score character per sequence symbol.
            if (sequenceLine.Length != bioReader.Line.Length)
            {
                throw BuildFastQRecordError(Resource.FastQ_InvalidQualityScoresLength, id);
            }

            bioReader.GoToNextLine();

            // Fall back to identifying the alphabet from the sequence line when none is configured.
            IAlphabet recordAlphabet = Alphabet;
            if (recordAlphabet == null)
            {
                recordAlphabet = IdentifyAlphabet(recordAlphabet, sequenceLine);

                if (recordAlphabet == null)
                {
                    throw BuildFastQRecordError(Resource.InvalidSymbolInString, sequenceLine);
                }
            }

            // The configured format type is used unless auto-detection is switched on.
            FastQFormatType formatType = FastqType;
            if (AutoDetectFastQFormat)
            {
                formatType = IdentifyFastQFormatType(qualityScores);
            }

            QualitativeSequence parsed = Encoding == null
                ? new QualitativeSequence(recordAlphabet, formatType, sequenceLine, qualityScores)
                : new QualitativeSequence(recordAlphabet, formatType, Encoding, sequenceLine, qualityScores);

            parsed.ID         = id;
            parsed.IsReadOnly = isReadOnly;

            // full load: the in-memory sequence is the result
            if (_blockSize == FileLoadHelper.DefaultFullLoadBlockSize)
            {
                return parsed;
            }

            pointer.AlphabetName = parsed.Alphabet.Name;
            pointer.Id           = parsed.ID;
            _sequencePointers.Add(pointer);

            FileVirtualQualitativeSequenceProvider provider = new FileVirtualQualitativeSequenceProvider(this, pointer);
            provider.BlockSize         = _blockSize;
            provider.MaxNumberOfBlocks = _maxNumberOfBlocks;

            parsed.VirtualQualitativeSequenceProvider = provider;

            return parsed;
        }

        /// <summary>
        /// Formats a record-level error, wraps it in the parser's general I/O format message,
        /// reports it through Trace and returns the exception for the caller to throw.
        /// </summary>
        /// <param name="detailFormat">Format string of the specific error.</param>
        /// <param name="detailArgument">Value substituted into <paramref name="detailFormat"/>.</param>
        /// <returns>The exception carrying the reported message.</returns>
        private FileFormatException BuildFastQRecordError(string detailFormat, string detailArgument)
        {
            string detail = string.Format(CultureInfo.CurrentCulture, detailFormat, detailArgument);
            string report = string.Format(CultureInfo.CurrentCulture, Resource.IOFormatErrorMessage, Name, detail);
            Trace.Report(report);
            return new FileFormatException(report);
        }
Example #21
0
        /// <summary>
        /// Parses a characters block: reads the sequence length from the DIMENSIONS command
        /// ("nchar="), skips over the FORMAT command, collects the rows of the MATRIX command
        /// for the given IDs and finally checks every collected sequence against that length.
        /// </summary>
        /// <param name="bioReader">
        /// A reader for a biological sequence text. The method advances it before reading each
        /// line, and once more after the block has ended.
        /// </param>
        /// <param name="IDs">List of sequence IDs; MATRIX rows whose first token is not in this list are ignored.</param>
        /// <returns>
        /// Dictionary mapping each ID found in the MATRIX to its data. Rows that repeat an ID
        /// are concatenated in the order they are read.
        /// </returns>
        /// <exception cref="FormatException">
        /// A collected sequence does not have the length given by "nchar=".
        /// </exception>
        private static Dictionary <string, string> ParseCharacterBlock(BioTextReader bioReader, IList <string> IDs)
        {
            bool   isInCharactersBlock = true;
            string data           = string.Empty;
            int    sequenceLength = 0;
            Dictionary <string, string> dataSet = new Dictionary <string, string>();

            while (bioReader.HasLines && isInCharactersBlock)
            {
                // The line the reader is on at entry is skipped; parsing starts with the next one.
                bioReader.GoToNextLine();

                // GetTokens is defined elsewhere; presumably it splits the line into
                // whitespace-separated tokens - verify.
                // NOTE(review): tokens[0] is read without checking that any token exists.
                IList <string> tokens = GetTokens(bioReader.Line);

                if (0 == string.Compare("DIMENSIONS", tokens[0], StringComparison.OrdinalIgnoreCase))
                {
                    // Blank out the command word so the token loop below skips it.
                    tokens[0] = string.Empty;

                    // Parse dimensions
                    // 1. Length of sequence
                    // The command may span several lines; it ends on a line that ends with ';'.
                    do
                    {
                        foreach (string token in tokens)
                        {
                            data = token.Trim(new char[] { ';' });

                            if (string.IsNullOrEmpty(data))
                            {
                                continue;
                            }

                            // Only "nchar=" is used; every other setting is ignored.
                            if (data.StartsWith("nchar=", StringComparison.OrdinalIgnoreCase))
                            {
                                sequenceLength = Int32.Parse(data.Substring(6), CultureInfo.InvariantCulture);
                            }
                        }

                        if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                        else
                        {
                            bioReader.GoToNextLine();
                            tokens = GetTokens(bioReader.Line);
                        }
                    }while (bioReader.HasLines);
                }
                else if (0 == string.Compare("FORMAT", tokens[0], StringComparison.OrdinalIgnoreCase))
                {
                    tokens[0] = string.Empty;

                    // Parse format
                    // 1. Notation for "missing"
                    // 2. Notation for "gap"
                    // 3. Notation for "matchchar"
                    // 4. data type
                    // None of these values is stored; the loop only advances the reader to the
                    // line that ends with ';'.
                    do
                    {
                        if (bioReader.Line.Trim().EndsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                        else
                        {
                            bioReader.GoToNextLine();
                            tokens = GetTokens(bioReader.Line);
                        }
                    }while (bioReader.HasLines);
                }
                // NOTE(review): this is a plain "if", not "else if", so it is also evaluated after
                // the DIMENSIONS/FORMAT branches, using whatever "tokens" those branches left
                // behind - confirm this is intended.
                if (0 == string.Compare("MATRIX", tokens[0], StringComparison.OrdinalIgnoreCase))
                {
                    tokens[0] = string.Empty;

                    // "If available" ignore the data in square brackets []
                    // NOTE(review): the reader is presumably still on the MATRIX line here, which
                    // would make this loop a no-op - verify that it can ever skip a line.
                    while (bioReader.HasLines)
                    {
                        if (bioReader.Line.StartsWith("[", StringComparison.OrdinalIgnoreCase))
                        {
                            bioReader.GoToNextLine();
                        }
                        else
                        {
                            break;
                        }
                    }

                    // Here are the alignment sequences
                    while (bioReader.HasLines)
                    {
                        bioReader.GoToNextLine();

                        // Blank lines between rows are skipped.
                        if (string.IsNullOrEmpty(bioReader.Line.Trim()))
                        {
                            continue;
                        }

                        tokens = GetTokens(bioReader.Line);

                        // A line whose first token starts with ';' closes the matrix and the block.
                        if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
                        {
                            isInCharactersBlock = false;
                            break;
                        }

                        // Rows for unknown IDs are ignored. A row for a known ID is appended to
                        // the data already collected for that ID.
                        // NOTE(review): tokens[1] is read without checking that the row has a
                        // second token.
                        if (IDs.Contains(tokens[0]))
                        {
                            data = tokens[1];

                            if (dataSet.ContainsKey(tokens[0]))
                            {
                                data = string.Concat(dataSet[tokens[0]], data);
                            }

                            dataSet[tokens[0]] = data;
                        }
                    }
                }
                else if (tokens[0].StartsWith(";", StringComparison.OrdinalIgnoreCase))
                {
                    // A first token starting with ';' outside MATRIX also ends the block.
                    isInCharactersBlock = false;
                }
            }

            // Read the end line "end;"
            bioReader.GoToNextLine();

            // Validate the length of sequence
            // sequenceLength is still 0 when no "nchar=" was read, so in that case any
            // non-empty sequence fails this check.
            foreach (string dataSequence in dataSet.Values)
            {
                if (dataSequence.Length != sequenceLength)
                {
                    throw new FormatException(Properties.Resource.SequenceLengthMismatch);
                }
            }

            return(dataSet);
        }
Example #22
0
        /// <summary>
        /// Parses a single ClustalW text from a reader into a sequence alignment.
        /// The first line must start with "CLUSTAL"; the remaining lines are read as
        /// blocks of "id segment [number]" data lines separated by blank or consensus lines.
        /// Segments that share an id across blocks are appended to the same sequence.
        /// </summary>
        /// <param name="bioReader">A reader for a biological sequence text.</param>
        /// <param name="isReadOnly">
        /// Flag to indicate whether the resulting sequence alignment should be in readonly mode or not.
        /// If this flag is set to true then the resulting sequence's isReadOnly property
        /// will be set to true, otherwise it will be set to false.</param>
        /// <returns>A new Sequence Alignment instance containing parsed data.</returns>
        /// <exception cref="ArgumentNullException">Thrown when bioReader is null.</exception>
        /// <exception cref="InvalidDataException">
        /// Thrown when the first line does not start with "CLUSTAL", when a data line does not
        /// contain both an id and a sequence segment, when no alphabet can be identified for a
        /// segment, when sequences in the first block resolve to different alphabets, or when a
        /// later block refers to an id that was not present in the first block.</exception>
        protected ISequenceAlignment ParseOneWithSpecificFormat(BioTextReader bioReader, bool isReadOnly)
        {
            if (bioReader == null)
            {
                throw new ArgumentNullException("bioReader");
            }

            string message = string.Empty;

            if (!bioReader.Line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            bioReader.GoToNextLine();  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            bioReader.SkipBlankLines = false;

            Dictionary<string, ISequence> mapIdToSequence = new Dictionary<string, ISequence>();
            IAlphabet alignmentAlphabet = null;
            bool isFirstBlock = true;
            bool inBlock = false;

            while (bioReader.HasLines)
            {
                // Blank line or consensus line signals end of block.
                if (String.IsNullOrEmpty(bioReader.Line) ||
                    Helper.ContainsOnly(bioReader.Line, '*', ' ', '.', '+', ':'))
                {
                    if (inBlock)
                    {
                        // Blank line signifies end of block
                        inBlock = false;
                        isFirstBlock = false;
                    }
                }
                else // It's not a blank or consensus line.
                {
                    // It's a data line in a block.
                    // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore
                    string[] tokens = bioReader.Line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries); // (char[])null uses whitespace delimiters

                    // A data line needs at least an id and a segment. Report a malformed line as
                    // invalid input instead of failing with an IndexOutOfRangeException below.
                    if (tokens.Length < 2)
                    {
                        message = string.Format(CultureInfo.CurrentCulture, Resource.INVAILD_INPUT_FILE, this.Name);
                        throw new InvalidDataException(message);
                    }

                    string id = tokens[0];
                    string data = tokens[1].ToUpper(CultureInfo.InvariantCulture);
                    Sequence sequence = null;
                    IAlphabet alphabet = Alphabet;

                    inBlock = true;
                    if (isFirstBlock)
                    {
                        // The first block introduces every sequence; create one entry per id.
                        if (null == alphabet)
                        {
                            // No alphabet was configured on the parser, so derive one from this segment.
                            alphabet = _basicParser.IdentifyAlphabet(alphabet, data);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Resource.InvalidSymbolInString,
                                    data);
                                throw new InvalidDataException(message);
                            }
                            else
                            {
                                if (null == alignmentAlphabet)
                                {
                                    // The first identified alphabet becomes the reference for the alignment.
                                    alignmentAlphabet = alphabet;
                                }
                                else
                                {
                                    if (alignmentAlphabet != alphabet)
                                    {
                                        message = string.Format(
                                            CultureInfo.CurrentCulture,
                                            Properties.Resource.SequenceAlphabetMismatch);
                                        throw new InvalidDataException(message);
                                    }
                                }
                            }
                        }

                        if (Encoding == null)
                        {
                            sequence = new Sequence(alphabet, data);
                        }
                        else
                        {
                            sequence = new Sequence(alphabet, Encoding, data);
                        }

                        sequence.ID = id;

                        // Keep the sequence writable while later blocks are still being appended;
                        // the requested read-only state is applied once parsing has finished.
                        sequence.IsReadOnly = false;

                        // Dictionary.Add throws ArgumentException if the same id appears twice in the first block.
                        mapIdToSequence.Add(id, sequence);
                    }
                    else
                    {
                        // Later blocks may only extend sequences that the first block introduced.
                        ISequence existingSequence;
                        if (!mapIdToSequence.TryGetValue(id, out existingSequence))
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.ClustalUnknownSequence, id);
                            throw new InvalidDataException(message);
                        }

                        sequence = (Sequence)existingSequence;
                        sequence.InsertRange(sequence.Count, data);
                    }
                }

                bioReader.GoToNextLine();
            }

            SequenceAlignment sequenceAlignment = new SequenceAlignment();

            // All parsed sequences are placed into a single aligned-sequence entry.
            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
            foreach (Sequence alignmentSequence in mapIdToSequence.Values)
            {
                alignmentSequence.IsReadOnly = isReadOnly;
                sequenceAlignment.AlignedSequences[0].Sequences.Add(alignmentSequence);
            }

            return sequenceAlignment;
        }