/// <summary>
/// Parses a single biological sequence alignment text from a stream.
/// </summary>
/// <remarks>
/// The first line must start with "CLUSTAL" (case-insensitive). The remaining lines are
/// grouped into blocks: a blank line, or a line made only of the characters
/// '*', ' ', '.', '+' and ':', closes the current block. Every other line is split on
/// whitespace into a sequence id (first token) and a sequence segment (second token);
/// any further tokens are ignored. Segments with the same id are concatenated across blocks.
/// </remarks>
/// <param name="reader">Reader supplying the alignment text.</param>
/// <returns>
/// A sequence alignment holding one aligned sequence, which contains one sequence per id
/// found in the first block.
/// </returns>
/// <exception cref="InvalidDataException">
/// Thrown when there is no text, the header is missing, a data line has fewer than two
/// tokens, no alphabet can be detected, detected alphabets differ between sequences, or
/// a later block uses an id that the first block did not declare.
/// </exception>
private ISequenceAlignment ParseOne(StreamReader reader)
{
    // No empty files allowed.
    if (line == null)
    {
        ReadNextLine(reader);
    }

    if (line == null)
    {
        throw new InvalidDataException(Properties.Resource.IONoTextToParse);
    }

    if (!line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
    {
        throw new InvalidDataException(
            string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
    }

    // Peeking *before* each read records whether that read has real input to return.
    // Gating the loop on this flag (instead of peeking after the read, as the loop
    // condition used to) ensures the final line of the input is processed too.
    bool haveUnprocessedLine = reader.Peek() != -1;

    // Move past the header to the first line of the first block.
    ReadNextLine(reader);

    // From here on blank lines are the block separators, so they must be seen.
    skipBlankLines = false;

    var mapIdToSequence = new Dictionary<string, Tuple<ISequence, List<byte>>>();
    IAlphabet alignmentAlphabet = null;
    bool isFirstBlock = true;
    bool inBlock = false;
    var endOfBlockSymbols = new HashSet<char> { '*', ' ', '.', '+', ':' };

    while (haveUnprocessedLine && line != null)
    {
        // Blank line or consensus line signals end of block.
        if (String.IsNullOrEmpty(line) || line.All(endOfBlockSymbols.Contains))
        {
            if (inBlock)
            {
                inBlock = false;
                isFirstBlock = false;
            }
        }
        else
        {
            // Data line: sequence id, then the sequence segment, and optionally a
            // number, which is ignored. (char[])null selects whitespace delimiters.
            string[] tokens = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length < 2)
            {
                // Report a malformed line as bad input rather than letting the
                // token indexing below fail with IndexOutOfRangeException.
                throw new InvalidDataException(
                    string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
            }

            string id = tokens[0];
            string data = tokens[1].ToUpperInvariant();
            byte[] byteData = Encoding.UTF8.GetBytes(data);
            Tuple<ISequence, List<byte>> sequenceTuple;
            IAlphabet alphabet = Alphabet;
            inBlock = true;

            if (isFirstBlock)
            {
                // Only the first block introduces ids; when no alphabet is configured
                // it is detected from this first segment.
                if (null == alphabet)
                {
                    alphabet = Alphabets.AutoDetectAlphabet(byteData, 0, byteData.Length, alphabet);
                    if (null == alphabet)
                    {
                        throw new InvalidDataException(string.Format(
                            CultureInfo.InvariantCulture,
                            Properties.Resource.InvalidSymbolInString,
                            data));
                    }

                    if (null == alignmentAlphabet)
                    {
                        alignmentAlphabet = alphabet;
                    }
                    else if (alignmentAlphabet != alphabet)
                    {
                        // Every auto-detected sequence must agree with the first one.
                        throw new InvalidDataException(string.Format(
                            CultureInfo.CurrentCulture,
                            Properties.Resource.SequenceAlphabetMismatch));
                    }
                }

                // The empty Sequence only carries the alphabet and id; the symbols are
                // accumulated in the byte list and materialized after the loop.
                sequenceTuple = new Tuple<ISequence, List<byte>>(
                    new Sequence(alphabet, "") { ID = id },
                    new List<byte>());
                sequenceTuple.Item2.AddRange(byteData);
                mapIdToSequence.Add(id, sequenceTuple);
            }
            else
            {
                if (!mapIdToSequence.TryGetValue(id, out sequenceTuple))
                {
                    throw new InvalidDataException(string.Format(
                        CultureInfo.CurrentCulture,
                        Properties.Resource.ClustalUnknownSequence,
                        id));
                }

                sequenceTuple.Item2.AddRange(byteData);
            }
        }

        haveUnprocessedLine = reader.Peek() != -1;
        if (haveUnprocessedLine)
        {
            ReadNextLine(reader);
        }
    }

    var sequenceAlignment = new SequenceAlignment();
    var alignedSequence = new AlignedSequence();
    sequenceAlignment.AlignedSequences.Add(alignedSequence);

    foreach (var alignmentSequenceTuple in mapIdToSequence.Values)
    {
        alignedSequence.Sequences.Add(
            new Sequence(alignmentSequenceTuple.Item1.Alphabet, alignmentSequenceTuple.Item2.ToArray())
            {
                ID = alignmentSequenceTuple.Item1.ID
            });
    }

    return sequenceAlignment;
}
/// <summary>
/// Parses a single biological sequence alignment text from a reader.
/// </summary>
/// <remarks>
/// After the header has been parsed, lines are consumed as a series of blocks as long as
/// the line following the header starts with "begin" (case-insensitive). The second token
/// of each block line selects the handling: TAXA supplies the sequence ids, CHARACTERS
/// supplies the sequence data for those ids, END stops parsing, and any other block is
/// skipped up to a line equal to "end;" (case-insensitive).
/// </remarks>
/// <param name="reader">A reader for a biological sequence alignment text.</param>
/// <returns>The parsed ISequenceAlignment object.</returns>
/// <exception cref="InvalidDataException">
/// Thrown when there is no text, a CHARACTERS block appears before any TAXA block has
/// supplied ids, an id has no data in the CHARACTERS block, no alphabet can be detected
/// for a sequence, or detected alphabets differ between sequences.
/// </exception>
ISequenceAlignment ParseOne(TextReader reader)
{
    ReadNextLine(reader);
    if (line == null)
    {
        // InvalidDataException (rather than the base Exception) matches the other
        // failures raised by this method and is still caught by catch (Exception).
        throw new InvalidDataException(Properties.Resource.INVALID_INPUT_FILE);
    }

    this.ParseHeader(reader);

    var alignedSequence = new AlignedSequence();
    IList<string> ids = null;
    bool isInBlock = true;

    // The input may end right after the header, so the line has to be null-checked
    // before it is inspected.
    if (this.line != null && this.line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
    {
        while (this.line != null && isInBlock)
        {
            if (string.IsNullOrEmpty(this.line.Trim()))
            {
                this.ReadNextLine(reader);
                continue;
            }

            string blockName = GetTokens(this.line)[1];

            switch (blockName.ToUpperInvariant())
            {
                case "TAXA":
                case "TAXA;":
                    // This block contains the count of sequences and the title of each sequence.
                    ids = this.ParseTaxaBlock(reader);
                    break;

                case "CHARACTERS":
                case "CHARACTERS;":
                    // The sequence data is looked up by id, so the ids must already be known.
                    if (ids == null)
                    {
                        throw new InvalidDataException(Properties.Resource.INVALID_INPUT_FILE);
                    }

                    // Block contains sequences.
                    Dictionary<string, string> dataSet = this.ParseCharacterBlock(reader, ids);

                    IAlphabet alignmentAlphabet = null;

                    foreach (string id in ids)
                    {
                        IAlphabet alphabet = this.Alphabet;

                        string data;
                        if (!dataSet.TryGetValue(id, out data))
                        {
                            // A declared id without data is malformed input, not a
                            // programming error (KeyNotFoundException).
                            throw new InvalidDataException(Properties.Resource.INVALID_INPUT_FILE);
                        }

                        if (null == alphabet)
                        {
                            byte[] dataArray = data.ToByteArray();
                            alphabet = Alphabets.AutoDetectAlphabet(dataArray, 0, dataArray.Length, null);

                            if (null == alphabet)
                            {
                                throw new InvalidDataException(string.Format(
                                    CultureInfo.InvariantCulture,
                                    Properties.Resource.InvalidSymbolInString,
                                    data));
                            }

                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else if (alignmentAlphabet != alphabet)
                            {
                                // Every auto-detected sequence must agree with the first one.
                                throw new InvalidDataException(string.Format(
                                    CultureInfo.InvariantCulture,
                                    Properties.Resource.SequenceAlphabetMismatch));
                            }
                        }

                        alignedSequence.Sequences.Add(new Sequence(alphabet, data) { ID = id });
                    }

                    break;

                case "END":
                case "END;":
                    // Have reached the end of block.
                    isInBlock = false;
                    break;

                default:
                    // Skip this block: read until its "end;" line or the end of input.
                    // string.Compare accepts a null line, so running out of input simply
                    // ends the loop through its condition.
                    while (this.line != null)
                    {
                        this.ReadNextLine(reader);
                        if (0 == string.Compare(this.line, "end;", StringComparison.OrdinalIgnoreCase))
                        {
                            break;
                        }
                    }

                    break;
            }

            this.ReadNextLine(reader);
        }
    }

    ISequenceAlignment sequenceAlignment = new SequenceAlignment();
    sequenceAlignment.AlignedSequences.Add(alignedSequence);
    return sequenceAlignment;
}
/// <summary>
/// Aligns the two sequences configured under an xml node, copies the first aligned
/// sequence of the first alignment into a new AlignedSequence, and asserts that the copy
/// carries the same sequences and metadata.
/// </summary>
/// <param name="nodeName">xml node name</param>
/// <param name="aligner">sw/nw/pw aligners</param>
private void ValidateAlignedSequenceCtor(string nodeName, ISequenceAligner aligner)
{
    // Read the test inputs from the xml configuration.
    string alphabetName = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlphabetNameNode);
    IAlphabet alphabet = Utility.GetAlphabet(alphabetName);
    string firstText = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode1);
    string secondText = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.SequenceNode2);

    // Create input sequences.
    var sequencesToAlign = new List<ISequence>
    {
        new Sequence(alphabet, firstText),
        new Sequence(alphabet, secondText)
    };

    // Run the aligner and pick the aligned sequence that serves as the reference.
    IAlignedSequence copy = new AlignedSequence();
    IList<ISequenceAlignment> alignments = aligner.Align(sequencesToAlign);
    var expected = alignments[0].AlignedSequences[0];

    // Copy sequences and metadata into the freshly constructed instance.
    foreach (var sequence in expected.Sequences)
    {
        copy.Sequences.Add(sequence);
    }

    foreach (string metadataKey in expected.Metadata.Keys)
    {
        copy.Metadata.Add(metadataKey, expected.Metadata[metadataKey]);
    }

    // Validate the aligned sequence properties against the reference.
    int position = 0;
    while (position < expected.Sequences.Count)
    {
        Assert.AreEqual(
            expected.Sequences[position].ToString(),
            copy.Sequences[position].ToString());
        position++;
    }

    foreach (string metadataKey in expected.Metadata.Keys)
    {
        Assert.AreEqual(expected.Metadata[metadataKey], copy.Metadata[metadataKey]);
    }

    ApplicationLog.WriteLine(@"Alignment BVT : Validation of aligned sequence ctor completed successfully");
}