Ejemplo n.º 1
0
        /// <summary>
        /// Parses a single biological sequence alignment text from a stream.
        /// </summary>
        /// <param name="reader">Reader</param>
        /// <returns>Sequence</returns>
        private ISequenceAlignment ParseOne(StreamReader reader)
        {
            // no empty files allowed
            if (line == null)
                ReadNextLine(reader);

            if (line == null)
                throw new InvalidDataException(Properties.Resource.IONoTextToParse);

            if (!line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
            {
                throw new InvalidDataException(
                    string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
            }

            ReadNextLine(reader);  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            skipBlankLines = false;

            var mapIdToSequence = new Dictionary<string, Tuple<ISequence, List<byte>>>();
            IAlphabet alignmentAlphabet = null;
            bool isFirstBlock = true;
            bool inBlock = false;
            var endOfBlockSymbols = new HashSet<char> { '*', ' ', '.', '+', ':' };

            while (reader.Peek() != -1)
            {
                // Blank line or consensus line signals end of block.
                if (String.IsNullOrEmpty(line) || line.ToCharArray().All(endOfBlockSymbols.Contains))
                {
                    if (inBlock)
                    {
                        // Blank line signifies end of block
                        inBlock = false;
                        isFirstBlock = false;
                    }
                }
                else // It's not a blank or consensus line.
                {
                    // It's a data line in a block.
                    // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore
                    string[] tokens = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries); // (char[])null uses whitespace delimiters
                    string id = tokens[0];
                    string data = tokens[1].ToUpperInvariant();
                    byte[] byteData = Encoding.UTF8.GetBytes(data);
                    Tuple<ISequence, List<byte>> sequenceTuple;
                    IAlphabet alphabet = Alphabet;

                    inBlock = true;
                    if (isFirstBlock)
                    {
                        if (null == alphabet)
                        {
                            alphabet = Alphabets.AutoDetectAlphabet(byteData, 0, byteData.Length, alphabet);

                            if (null == alphabet)
                            {
                                throw new InvalidDataException(string.Format(
                                        CultureInfo.InvariantCulture,
                                        Properties.Resource.InvalidSymbolInString,
                                        data));
                            }
                            
                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else
                            {
                                if (alignmentAlphabet != alphabet)
                                {
                                    throw new InvalidDataException(string.Format(
                                        CultureInfo.CurrentCulture,
                                        Properties.Resource.SequenceAlphabetMismatch));
                                }
                            }
                        }

                        sequenceTuple = new Tuple<ISequence, List<byte>>(
                            new Sequence(alphabet, "") { ID = id }, 
                            new List<byte>());
                        sequenceTuple.Item2.AddRange(byteData);

                        mapIdToSequence.Add(id, sequenceTuple);
                    }
                    else
                    {
                        if (!mapIdToSequence.ContainsKey(id))
                        {
                            throw new InvalidDataException(string.Format(CultureInfo.CurrentCulture, Properties.Resource.ClustalUnknownSequence, id));
                        }

                        sequenceTuple = mapIdToSequence[id];
                        sequenceTuple.Item2.AddRange(byteData);
                    }
                }

                ReadNextLine(reader);
            }

            var sequenceAlignment = new SequenceAlignment();
            var alignedSequence = new AlignedSequence();
            sequenceAlignment.AlignedSequences.Add(alignedSequence);
            foreach (var alignmentSequenceTuple in mapIdToSequence.Values)
            {
                alignedSequence.Sequences.Add(
                    new Sequence(alignmentSequenceTuple.Item1.Alphabet, alignmentSequenceTuple.Item2.ToArray()) 
                    { 
                        ID = alignmentSequenceTuple.Item1.ID 
                    });
            }

            return sequenceAlignment;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Parses a single biological sequence alignment text from a reader.
        /// </summary>
        /// <param name="reader">A reader for a biological sequence alignment text.</param>
        /// <returns>The parsed ISequenceAlignment object.</returns>
        ISequenceAlignment ParseOne(TextReader reader)
        {
            ReadNextLine(reader);
            if (line == null)
            {
                throw new Exception(Properties.Resource.INVALID_INPUT_FILE);
            }

            this.ParseHeader(reader);

            var alignedSequence = new AlignedSequence();
            IList<string> ids = null;
            bool isInBlock = true;

            if (this.line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
            {
                while (this.line != null && isInBlock)
                {
                    if (string.IsNullOrEmpty(this.line.Trim()))
                    {
                        this.ReadNextLine(reader);
                        continue;
                    }

                    string blockName = GetTokens(this.line)[1];

                    switch (blockName.ToUpperInvariant())
                    {
                        case "TAXA":
                        case "TAXA;":
                            // This block contains the count of sequence & title of each sequence
                            ids = this.ParseTaxaBlock(reader);
                            break;

                        case "CHARACTERS":
                        case "CHARACTERS;":
                            // Block contains sequences
                            Dictionary<string, string> dataSet = this.ParseCharacterBlock(reader, ids);
                            IAlphabet alignmentAlphabet = null;

                            foreach (string id in ids)
                            {
                                IAlphabet alphabet = this.Alphabet;
                                string data = dataSet[id];

                                if (null == alphabet)
                                {
                                    byte[] dataArray = data.ToByteArray();
                                    alphabet = Alphabets.AutoDetectAlphabet(dataArray, 0, dataArray.Length, null);

                                    if (null == alphabet)
                                    {
                                        throw new InvalidDataException(string.Format(
                                            CultureInfo.InvariantCulture,
                                            Properties.Resource.InvalidSymbolInString,
                                            data));
                                    }
                                    
                                    if (null == alignmentAlphabet)
                                    {
                                        alignmentAlphabet = alphabet;
                                    }
                                    else
                                    {
                                        if (alignmentAlphabet != alphabet)
                                        {
                                            throw new InvalidDataException(string.Format(
                                                CultureInfo.InvariantCulture,
                                                Properties.Resource.SequenceAlphabetMismatch));
                                        }
                                    }
                                }

                                alignedSequence.Sequences.Add(new Sequence(alphabet, data) { ID = id });
                            }

                            break;

                        case "END":
                        case "END;":
                            // Have reached the end of block
                            isInBlock = false;
                            break;

                        default:
                            // skip this block
                            while (this.line != null)
                            {
                                this.ReadNextLine(reader);
                                if (0 == string.Compare(this.line, "end;", StringComparison.OrdinalIgnoreCase))
                                {
                                    break;
                                }
                            }
                            break;
                    }

                    this.ReadNextLine(reader);
                }
            }

            ISequenceAlignment sequenceAlignment = new SequenceAlignment();
            sequenceAlignment.AlignedSequences.Add(alignedSequence);
            return sequenceAlignment;
        }
Ejemplo n.º 3
0
        /// <summary>
        ///     Validate aligned sequence instance using different aligners
        /// </summary>
        /// <param name="nodeName">xml node name</param>
        /// <param name="aligner">sw/nw/pw aligners</param>
        private void ValidateAlignedSequenceCtor(string nodeName, ISequenceAligner aligner)
        {
            IAlphabet alphabet = Utility.GetAlphabet(this.utilityObj.xmlUtil.GetTextValue(nodeName,
                                                                                     Constants.AlphabetNameNode));
            string origSequence1 = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode1);
            string origSequence2 = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode2);

            // Create input sequences
            var inputSequences = new List<ISequence>();
            inputSequences.Add(new Sequence(alphabet, origSequence1));
            inputSequences.Add(new Sequence(alphabet, origSequence2));

            // Get aligned sequences
            IAlignedSequence alignedSequence = new AlignedSequence();
            IList<ISequenceAlignment> alignment = aligner.Align(inputSequences);

            // add aligned sequence and metadata information
            for (int iseq = 0; iseq < alignment[0].AlignedSequences[0].Sequences.Count; iseq++)
            {
                alignedSequence.Sequences.Add(alignment[0].AlignedSequences[0].Sequences[iseq]);
            }

            foreach (string key in alignment[0].AlignedSequences[0].Metadata.Keys)
            {
                alignedSequence.Metadata.Add(key, alignment[0].AlignedSequences[0].Metadata[key]);
            }

            // Validate the alignedsequence properties
            for (int index = 0; index < alignment[0].AlignedSequences[0].Sequences.Count; index++)
            {
                Assert.AreEqual(alignment[0].AlignedSequences[0].Sequences[index].ToString(),
                                alignedSequence.Sequences[index].ToString());
            }

            foreach (string key in alignment[0].AlignedSequences[0].Metadata.Keys)
            {
                Assert.AreEqual(alignment[0].AlignedSequences[0].Metadata[key],
                                alignedSequence.Metadata[key]);
            }

            ApplicationLog.WriteLine(@"Alignment BVT : Validation of aligned sequence ctor completed successfully");
        }