C# (CSharp) Alphabets.AutoDetectAlphabet 예제들

프로그래밍 언어: C# (CSharp)

클래스/타입: Alphabets

메소드/함수: AutoDetectAlphabet

hotexamples.com에서의 예제들: 22

C# (CSharp) Alphabets.AutoDetectAlphabet - 22개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 C# (CSharp)의 Alphabets.AutoDetectAlphabet에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

AutoDetectAlphabet(22)

CheckIsFromSameBase(11)

GetAmbiguousAlphabet(6)

GetAlphabet(6)

Contains(5)

Remove(3)

Load(3)

FirstOrDefault(3)

Add(2)

Get(2)

GenerateSubSet(1)

Generate(1)

GetHighestChar(1)

IndexOf(1)

Insert(1)

Count(1)

OrderBy(1)

Clear(1)

ToArray(1)

예제 #1

파일 보기

파일: FieldTextFileParser.cs 프로젝트: Szunyike/SAM-Statistic-2018

        /// <summary>
        ///     Parses one line from the text file.
        /// </summary>
        /// <param name="line"></param>
        /// <returns></returns>
        private ISequence ParseLine(string line)
        {
            string[] splitLine = line.Split(this.Delimiter);
            if (splitLine.Length != 2)
            {
                throw new Exception(string.Format(CultureInfo.InvariantCulture, Resource.INVALID_INPUT_FILE, line));
            }

            IAlphabet alphabet = this.Alphabet;

            if (alphabet == null)
            {
                byte[] byteArray = Encoding.UTF8.GetBytes(splitLine[1]);
                alphabet = Alphabets.AutoDetectAlphabet(byteArray, 0, byteArray.Length, null);
                if (alphabet == null)
                {
                    throw new Exception(string.Format(CultureInfo.InvariantCulture, Resource.InvalidSymbolInString, splitLine[1]));
                }
            }

            return(new Sequence(alphabet, splitLine[1])
            {
                ID = splitLine[0]
            });
        }

예제 #2

파일 보기

        /// <summary>
        /// Analyze the given sequences and store a consensus into its Consensus property.
        /// </summary>
        /// <param name="referenceSequence">Reference sequence.</param>
        /// <param name="querySequence">Query sequence.</param>
        /// <returns>Consensus of sequences.</returns>
        protected ISequence MakeConsensus(
            ISequence referenceSequence,
            ISequence querySequence)
        {
            if (referenceSequence == null)
            {
                throw new ArgumentNullException("referenceSequence");
            }

            if (querySequence == null)
            {
                throw new ArgumentNullException("querySequence");
            }

            // For each pair of symbols (characters) in reference and query sequence
            // get the consensus symbol and append it.
            byte[] consensus = new byte[referenceSequence.Count];
            for (int index = 0; index < referenceSequence.Count; index++)
            {
                consensus[index] = ConsensusResolver.GetConsensus(
                    new byte[] { referenceSequence[index], querySequence[index] });
            }

            IAlphabet alphabet = Alphabets.AutoDetectAlphabet(consensus, 0, consensus.LongLength, referenceSequence.Alphabet);

            return(new Sequence(alphabet, consensus, false));
        }

예제 #3

파일 보기

        /// <summary>
        /// Parses the GenBank Origin data from the GenBank file.
        /// </summary>
        /// <param name="line">parse line</param>
        /// <param name="metadata">The GenBank metadata.</param>
        /// <param name="stream">The stream reader.</param>
        private void ParseOrigin(ref string line, GenBankMetadata metadata, StreamReader stream)
        {
            // The origin line can contain optional data; don't put empty string into
            // metadata.
            string lineData = GetLineData(line, DataIndent);

            if (!String.IsNullOrEmpty(lineData))
            {
                metadata.Origin = lineData;
            }

            line = GoToNextLine(line, stream);
            IAlphabet alphabet = null;

            var sequenceBuilder = new StringBuilder();

            while ((line != null) && line[0] == ' ')
            {
                // Using a regex is too slow.
                int len = line.Length;
                int k   = 10;
                while (k < len)
                {
                    string seqData = line.Substring(k, Math.Min(10, len - k));

                    sequenceBuilder.Append(seqData);
                    k += 11;
                }

                line = GoToNextLine(line, stream);
            }

            var sequenceString = sequenceBuilder.ToString().Trim();

            if (!string.IsNullOrEmpty(sequenceString))
            {
                if (Alphabet == null)
                {
                    byte[] tempData = System.Text.Encoding.ASCII.GetBytes(sequenceString.ToUpper(CultureInfo.InvariantCulture));
                    alphabet = Alphabets.AutoDetectAlphabet(tempData, 0, tempData.Length, alphabet);

                    if (alphabet == null)
                    {
                        var message = String.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line);
                        Trace.Report(message);
                        throw new InvalidDataException(message);
                    }
                }
                else
                {
                    alphabet = Alphabet;
                }

                sequenceWithData = new Sequence(alphabet, sequenceString);
            }
        }

예제 #4

파일 보기

        /// <summary>
        /// Adds consensus to the alignment result.  At this point, it is a very simple algorithm
        /// which puts an ambiguity character where the two aligned sequences do not match.
        /// Uses X and N for protein and DNA/RNA alignments, respectively.
        /// </summary>
        /// <param name="alignment">
        /// Alignment to which to add the consensus.  This is the result returned by the main Align
        /// or AlignSimple method, which contains the aligned sequences but not yet a consensus sequence.
        /// </param>
        private void AddSimpleConsensusToResult(PairwiseAlignedSequence alignment)
        {
            ISequence seq0 = alignment.FirstSequence;
            ISequence seq1 = alignment.SecondSequence;

            byte[] consensus = new byte[seq0.Count];
            for (int i = 0; i < seq0.Count; i++)
            {
                consensus[i] = ConsensusResolver.GetConsensus(
                    new byte[] { seq0[i], seq1[i] });
            }

            IAlphabet consensusAlphabet = Alphabets.AutoDetectAlphabet(consensus, 0, consensus.GetLongLength(), seq0.Alphabet);

            alignment.Consensus = new Sequence(consensusAlphabet, consensus, false);
        }

예제 #5

파일 보기

        public void ValidateAutoDetectAlphabet()
        {
            string alphabetName = utilityObj.xmlUtil.GetTextValue(
                Constants.DnaDerivedSequenceNode, Constants.AlphabetNameNode);
            string dnaSequence = utilityObj.xmlUtil.GetTextValue(
                Constants.DnaDerivedSequenceNode, Constants.ExpectedDerivedSequence);

            byte[] dnaArray = encodingObj.GetBytes(dnaSequence);

            //Validating for Dna.
            IAlphabet dnaAplhabet = Alphabets.AutoDetectAlphabet(dnaArray, 0, 4, null);

            Assert.AreEqual(dnaAplhabet.Name, alphabetName);
            ApplicationLog.WriteLine(string.Concat(
                                         "Alphabets BVT: Validation of Auto Detect method for Dna completed successfully."));

            //Validating for Rna.
            alphabetName = "";
            alphabetName = utilityObj.xmlUtil.GetTextValue(
                Constants.RnaDerivedSequenceNode, Constants.AlphabetNameNode);
            string rnaSequence = utilityObj.xmlUtil.GetTextValue(
                Constants.RnaDerivedSequenceNode, Constants.ExpectedDerivedSequence);

            byte[] rnaArray = encodingObj.GetBytes(rnaSequence);

            IAlphabet rnaAplhabet = Alphabets.AutoDetectAlphabet(rnaArray, 0, 4, null);

            Assert.AreEqual(rnaAplhabet.Name, alphabetName);
            ApplicationLog.WriteLine(string.Concat(
                                         "Alphabets BVT: Validation of Auto Detect method for Rna completed successfully."));

            //Validating for Protein.
            alphabetName = "";
            alphabetName = utilityObj.xmlUtil.GetTextValue(
                Constants.ProteinDerivedSequenceNode, Constants.AlphabetNameNode);
            string proteinSequence = utilityObj.xmlUtil.GetTextValue(
                Constants.ProteinDerivedSequenceNode, Constants.ExpectedDerivedSequence);

            byte[]    proteinArray    = encodingObj.GetBytes(proteinSequence);
            IAlphabet proteinAplhabet = Alphabets.AutoDetectAlphabet(proteinArray, 0, 4, null);

            Assert.AreEqual(proteinAplhabet.Name, alphabetName);
            ApplicationLog.WriteLine(string.Concat(
                                         "Alphabets BVT: Validation of Auto Detect method for Protein completed successfully."));
        }

예제 #6

파일 보기

        private ISequence ParseLine(string line)
        {
            string[] splitLine = line.Split(Delimiter);
            string   message;

            if (splitLine.Length != 2)
            {
                message = string.Format(CultureInfo.InvariantCulture,
                                        Resource.INVALID_INPUT_FILE,
                                        line);
                Trace.Report(message);
                throw new FileFormatException(message);
            }

            IAlphabet alphabet = Alphabet;

            if (alphabet == null)
            {
                byte[] byteArray = UTF8Encoding.UTF8.GetBytes(splitLine[1]);
                alphabet = Alphabets.AutoDetectAlphabet(byteArray, 0, byteArray.Length, null);

                if (alphabet == null)
                {
                    message = string.Format(CultureInfo.InvariantCulture,
                                            Resource.InvalidSymbolInString,
                                            splitLine[1]);
                    Trace.Report(message);
                    throw new FileFormatException(message);
                }
            }

            Sequence sequence;

            sequence = new Sequence(alphabet, splitLine[1])
            {
                ID = splitLine[0]
            };

            return(sequence);
        }

예제 #7

파일 보기

        /// <summary>
        /// Parses a single sequence from the file.
        /// </summary>
        /// <param name="header">Parsed header</param>
        /// <param name="reader">Binary reader</param>
        /// <returns>Sequence</returns>
        private ISequence ParseOne(SffHeader header, BinaryReader reader)
        {
            // Parse out the read header.
            ushort headerLength  = C2BE(reader.ReadUInt16());
            ushort nameLength    = C2BE(reader.ReadUInt16());
            uint   numberOfBases = C2BE(reader.ReadUInt32());

            // TODO: use clipping data
            ushort clipQualityLeft  = C2BE(reader.ReadUInt16());
            ushort clipQualityRight = C2BE(reader.ReadUInt16());
            ushort clipAdapterLeft  = C2BE(reader.ReadUInt16());
            ushort clipAdapterRight = C2BE(reader.ReadUInt16());

            string name = new string(reader.ReadChars(nameLength));

            long paddingSize = headerLength - (16 + nameLength);

            if (paddingSize < 0 || paddingSize > 8)
            {
                throw new Exception("Invalid read header size found.");
            }
            if (paddingSize > 0)
            {
                if (reader.Read(new char[8], 0, (int)paddingSize) != paddingSize)
                {
                    throw new Exception("Could not parse read header (padding).");
                }
            }

            // Parse out the read data section
            ushort[] flowgramValues = new ushort[header.NumberOfFlowsPerRead];
            for (int flowCount = 0; flowCount < header.NumberOfFlowsPerRead; flowCount++)
            {
                flowgramValues[flowCount] = C2BE(reader.ReadUInt16());
            }

            byte[] flowIndexPerBase = new byte[numberOfBases];
            if (reader.Read(flowIndexPerBase, 0, (int)numberOfBases) != numberOfBases)
            {
                throw new Exception("Unable to read flow indexes.");
            }

            byte[] bases = new byte[numberOfBases];
            if (reader.Read(bases, 0, (int)numberOfBases) != numberOfBases)
            {
                throw new Exception("Unable to read base information.");
            }

            byte[] qscores = new byte[numberOfBases];
            if (reader.Read(qscores, 0, (int)numberOfBases) != numberOfBases)
            {
                throw new Exception("Unable to read quality scores.");
            }
            for (int i = 0; i < qscores.Length; i++)
            {
                qscores[i] += 33; // adjust for Sanger
            }
            // Adjust for 8-byte padding at end of read segment
            long currentSize = header.NumberOfFlowsPerRead * 2 + 3 * numberOfBases;

            if ((currentSize & 7) > 0)
            {
                paddingSize = (((currentSize >> 3) + 1) << 3) - currentSize;
                if (paddingSize < 0 || paddingSize > 8)
                {
                    throw new Exception("Invalid read data size found.");
                }
                if (paddingSize > 0)
                {
                    if (reader.Read(new char[8], 0, (int)paddingSize) != paddingSize)
                    {
                        throw new Exception("Could not parse read header (padding).");
                    }
                }
            }

            // Determine the alphabet.
            var alphabet = Alphabet
                           ?? Alphabets.AutoDetectAlphabet(bases, 0, bases.Length, null)
                           ?? Alphabets.AmbiguousDNA;

            // Return our qSequence
            return(new QualitativeSequence(alphabet, FastQFormatType.Sanger, bases, qscores, false)
            {
                ID = name
            });
        }

예제 #8

파일 보기

파일: BAMSequenceParser.cs 프로젝트: evolvedmicrobe/MTAssembler

        public IEnumerable <CompactSAMSequence> Parse()
        {
            if (string.IsNullOrWhiteSpace(_fileName))
            {
                throw new ArgumentNullException("fileName");
            }
            using (readStream = new FileStream(_fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                Stream reader = readStream;
                if (reader == null || reader.Length == 0)
                {
                    throw new FileFormatException(Properties.Resource.BAM_InvalidBAMFile);
                }
                if (!String.IsNullOrEmpty(ChromosomeToGet))
                {
                    foreach (var s in
                             ParseRangeAsEnumerableSequences(_fileName, ChromosomeToGet))
                    {
                        if (s != null)
                        {
                            yield return(s);
                        }
                        ////TODO: Super inefficient right now, am parsing the sequence multiple times,
                        ////fix this.
                        //var s2 = s.ToArray ();
                        //var alpha = Alphabets.AutoDetectAlphabet(s2, 0, s2.Length, null);


                        //var strippedOfInfo = new Sequence(alpha, s2);
                        //yield return strippedOfInfo;
                    }
                }
                else
                {
                    readStream = reader;
                    ValidateReader();
                    SAMAlignmentHeader   header = GetHeader();
                    SequenceAlignmentMap sequenceAlignmentMap = null;
                    if (sequenceAlignmentMap == null)
                    {
                        sequenceAlignmentMap = new SequenceAlignmentMap(header);
                    }

                    while (!IsEOF())
                    {
#if WANT_OLD_VERSION
                        SAMAlignedSequence alignedSeq = GetAlignedSequence(0, int.MaxValue);
#else
                        var alignedSeq = GetAlignedSequence();
#endif
                        if (alignedSeq != null)
                        {
#if WANT_OLD_VERSION
                            //make a new Sequence
                            ISequence strippedOfInfo = null;
                            try
                            {
                                var syms  = alignedSeq.QuerySequence.ToArray();
                                var alpha = Alphabets.AutoDetectAlphabet(syms, 0, syms.Length, null);
                                strippedOfInfo = new Sequence(alpha, alignedSeq.QuerySequence.ToArray());

                                strippedOfInfo = alignedSeq;
                            }
                            catch (ArgumentOutOfRangeException exception)
                            {
                                Debug.Write("Could not convert sequence: " + exception.Message);
                            }
                            if (strippedOfInfo != null)
                            {
                                yield return(strippedOfInfo);
                            }
#else
                            yield return(alignedSeq);
                                                        #endif
                        }
                        alignedSeq = null;
                    }
                }
            }
        }

예제 #9

파일 보기

파일: ClustalWParser.cs 프로젝트: Szunyike/SAM-Statistic-2018

        /// <summary>
        /// Parses a single biological sequence alignment text from a stream.
        /// </summary>
        /// <param name="reader">Reader</param>
        /// <returns>Sequence</returns>
        private ISequenceAlignment ParseOne(StreamReader reader)
        {
            // no empty files allowed
            if (line == null)
            {
                ReadNextLine(reader);
            }

            if (line == null)
            {
                throw new InvalidDataException(Properties.Resource.IONoTextToParse);
            }

            if (!line.StartsWith("CLUSTAL", StringComparison.OrdinalIgnoreCase))
            {
                throw new InvalidDataException(
                          string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name));
            }

            ReadNextLine(reader);  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            skipBlankLines = false;

            var       mapIdToSequence   = new Dictionary <string, Tuple <ISequence, List <byte> > >();
            IAlphabet alignmentAlphabet = null;
            bool      isFirstBlock      = true;
            bool      inBlock           = false;
            var       endOfBlockSymbols = new HashSet <char> {
                '*', ' ', '.', '+', ':'
            };

            while (reader.Peek() != -1)
            {
                // Blank line or consensus line signals end of block.
                if (String.IsNullOrEmpty(line) || line.ToCharArray().All(endOfBlockSymbols.Contains))
                {
                    if (inBlock)
                    {
                        // Blank line signifies end of block
                        inBlock      = false;
                        isFirstBlock = false;
                    }
                }
                else // It's not a blank or consensus line.
                {
                    // It's a data line in a block.
                    // Lines begin with sequence id, then the sequence segment, and optionally a number, which we will ignore
                    string[] tokens   = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries); // (char[])null uses whitespace delimiters
                    string   id       = tokens[0];
                    string   data     = tokens[1].ToUpperInvariant();
                    byte[]   byteData = Encoding.UTF8.GetBytes(data);
                    Tuple <ISequence, List <byte> > sequenceTuple;
                    IAlphabet alphabet = Alphabet;

                    inBlock = true;
                    if (isFirstBlock)
                    {
                        if (null == alphabet)
                        {
                            alphabet = Alphabets.AutoDetectAlphabet(byteData, 0, byteData.Length, alphabet);

                            if (null == alphabet)
                            {
                                throw new InvalidDataException(string.Format(
                                                                   CultureInfo.InvariantCulture,
                                                                   Properties.Resource.InvalidSymbolInString,
                                                                   data));
                            }

                            if (null == alignmentAlphabet)
                            {
                                alignmentAlphabet = alphabet;
                            }
                            else
                            {
                                if (alignmentAlphabet != alphabet)
                                {
                                    throw new InvalidDataException(string.Format(
                                                                       CultureInfo.CurrentCulture,
                                                                       Properties.Resource.SequenceAlphabetMismatch));
                                }
                            }
                        }

                        sequenceTuple = new Tuple <ISequence, List <byte> >(
                            new Sequence(alphabet, "")
                        {
                            ID = id
                        },
                            new List <byte>());
                        sequenceTuple.Item2.AddRange(byteData);

                        mapIdToSequence.Add(id, sequenceTuple);
                    }
                    else
                    {
                        if (!mapIdToSequence.ContainsKey(id))
                        {
                            throw new InvalidDataException(string.Format(CultureInfo.CurrentCulture, Properties.Resource.ClustalUnknownSequence, id));
                        }

                        sequenceTuple = mapIdToSequence[id];
                        sequenceTuple.Item2.AddRange(byteData);
                    }
                }

                ReadNextLine(reader);
            }

            var sequenceAlignment = new SequenceAlignment();
            var alignedSequence   = new AlignedSequence();

            sequenceAlignment.AlignedSequences.Add(alignedSequence);
            foreach (var alignmentSequenceTuple in mapIdToSequence.Values)
            {
                alignedSequence.Sequences.Add(
                    new Sequence(alignmentSequenceTuple.Item1.Alphabet, alignmentSequenceTuple.Item2.ToArray())
                {
                    ID = alignmentSequenceTuple.Item1.ID
                });
            }

            return(sequenceAlignment);
        }

예제 #10

파일 보기

파일: FastQParser.cs 프로젝트: slogen/bio

        /// <summary>
        ///     Returns a single QualitativeSequence from the FASTQ data.
        /// </summary>
        /// <param name="reader">Reader to be parsed.</param>
        /// <param name="formatType">FASTQ format type.</param>
        /// <returns>Returns a QualitativeSequence.</returns>
        private IQualitativeSequence ParseOne(StreamReader reader, FastQFormatType formatType)
        {
            if (reader.EndOfStream)
            {
                return(null);
            }

            string line = ReadNextLine(reader, true);

            if (line == null || !line.StartsWith("@", StringComparison.Ordinal))
            {
                string message = string.Format(CultureInfo.CurrentCulture, Resource.INVALID_INPUT_FILE, this.Name);
                throw new Exception(message);
            }

            // Process header line.
            string id = line.Substring(1).Trim();

            line = ReadNextLine(reader, true);
            if (string.IsNullOrEmpty(line))
            {
                string details = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_InvalidSequenceLine, id);
                string message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    this.Name,
                    details);
                throw new Exception(message);
            }

            // Get sequence from second line.
            byte[] sequenceData = Encoding.ASCII.GetBytes(line);

            // Goto third line.
            line = ReadNextLine(reader, true);

            // Check for '+' symbol in the third line.
            if (line == null || !line.StartsWith("+", StringComparison.Ordinal))
            {
                string details = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.FastQ_InvalidQualityScoreHeaderLine,
                    id);
                string message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    this.Name,
                    details);
                throw new Exception(message);
            }

            string qualScoreId = line.Substring(1).Trim();

            if (!string.IsNullOrEmpty(qualScoreId) && !id.Equals(qualScoreId))
            {
                string details = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.FastQ_InvalidQualityScoreHeaderData,
                    id);
                string message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    this.Name,
                    details);
                throw new Exception(message);
            }

            // Goto fourth line.
            line = ReadNextLine(reader, true);

            if (string.IsNullOrEmpty(line))
            {
                string details = string.Format(CultureInfo.CurrentCulture, Resource.FastQ_EmptyQualityScoreLine, id);
                string message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    this.Name,
                    details);
                throw new Exception(message);
            }

            // Get the quality scores from the fourth line.
            byte[] qualScores = Encoding.ASCII.GetBytes(line);

            // Check for sequence length and quality score length.
            if (sequenceData.GetLongLength() != qualScores.GetLongLength())
            {
                string details = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.FastQ_InvalidQualityScoresLength,
                    id);
                string message = string.Format(
                    CultureInfo.CurrentCulture,
                    Resource.IOFormatErrorMessage,
                    this.Name,
                    details);
                throw new Exception(message);
            }

            // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
            IAlphabet alphabet = this.Alphabet;

            if (alphabet == null)
            {
                alphabet = Alphabets.AutoDetectAlphabet(sequenceData, 0, sequenceData.GetLongLength(), alphabet);
                if (alphabet == null)
                {
                    throw new Exception(Resource.CouldNotIdentifyAlphabetType);
                }
            }
            else
            {
                if (!alphabet.ValidateSequence(sequenceData, 0, sequenceData.GetLongLength()))
                {
                    throw new Exception(Resource.InvalidAlphabetType);
                }
            }

            return(new QualitativeSequence(alphabet, formatType, sequenceData, qualScores, false)
            {
                ID = id
            });
        }

예제 #11

파일 보기

파일: BAMSequenceParser.cs 프로젝트: evolvedmicrobe/MTAssembler

        protected CompactSAMSequence GetAlignedSequence()
        {
            byte[] array = new byte[4];
            ReadUnCompressedData(array, 0, 4);
            int blockLen = Helper.GetInt32(array, 0);

            byte[] alignmentBlock = new byte[blockLen];
            ReadUnCompressedData(alignmentBlock, 0, blockLen);
            int    value;
            UInt32 UnsignedValue;
            // 0-4 bytes
            int    refSeqIndex = Helper.GetInt32(alignmentBlock, 0);
            string RName;

            if (refSeqIndex == -1)
            {
                RName = "*";
            }
            else
            {
                RName = refSeqNames[refSeqIndex];
            }

            // 4-8 bytes
            int Pos = Helper.GetInt32(alignmentBlock, 4) + 1;

            // 8 - 12 bytes "bin<<16|mapQual<<8|read_name_len"
            UnsignedValue = Helper.GetUInt32(alignmentBlock, 8);
            int queryNameLen = (int)(UnsignedValue & 0x000000FF);

            // 12 - 16 bytes
            UnsignedValue = Helper.GetUInt32(alignmentBlock, 12);
            int flagValue = (int)(UnsignedValue & 0xFFFF0000) >> 16;
            int cigarLen  = (int)(UnsignedValue & 0x0000FFFF);

            //// 16-20 bytes
            int readLen = Helper.GetInt32(alignmentBlock, 16);

            // 32-(32+readLen) bytes
            string        name       = System.Text.ASCIIEncoding.ASCII.GetString(alignmentBlock, 32, queryNameLen - 1);
            StringBuilder strbuilder = new StringBuilder();
            int           startIndex = 32 + queryNameLen;

            for (int i = startIndex; i < (startIndex + cigarLen * 4); i += 4)
            {
                // Get the CIGAR operation length stored in first 28 bits.
                UInt32 cigarValue = Helper.GetUInt32(alignmentBlock, i);
                strbuilder.Append(((cigarValue & 0xFFFFFFF0) >> 4).ToString(CultureInfo.InvariantCulture));

                // Get the CIGAR operation stored in last 4 bits.
                value = (int)cigarValue & 0x0000000F;

                // MIDNSHP=>0123456
                switch (value)
                {
                case 0:
                    strbuilder.Append("M");
                    break;

                case 1:
                    strbuilder.Append("I");
                    break;

                case 2:
                    strbuilder.Append("D");
                    break;

                case 3:
                    strbuilder.Append("N");
                    break;

                case 4:
                    strbuilder.Append("S");
                    break;

                case 5:
                    strbuilder.Append("H");
                    break;

                case 6:
                    strbuilder.Append("P");
                    break;

                case 7:
                    strbuilder.Append("=");
                    break;

                case 8:
                    strbuilder.Append("X");
                    break;

                default:
                    throw new FileFormatException(Properties.Resource.BAM_InvalidCIGAR);
                }
            }

            string cigar = strbuilder.ToString();

            if (string.IsNullOrWhiteSpace(cigar))
            {
                cigar = "*";
            }
            startIndex += cigarLen * 4;
            //strbuilder = new StringBuilder();
            byte[] seqData      = new byte[readLen];
            int    seqDataIndex = 0;
            int    index        = startIndex;

            for (; index < (startIndex + (readLen + 1) / 2) - 1; index++)
            {
                // Get first 4 bit value
                value = (alignmentBlock[index] & 0xF0) >> 4;
                //strbuilder.Append(GetSeqChar(value));
                seqData[seqDataIndex++] = GetSeqCharAsByte(value);
                // Get last 4 bit value
                value = alignmentBlock[index] & 0x0F;
                //strbuilder.Append(GetSeqChar(value));
                seqData[seqDataIndex++] = GetSeqCharAsByte(value);
            }
            value = (alignmentBlock[index] & 0xF0) >> 4;
            //strbuilder.Append(GetSeqChar(value));
            seqData[seqDataIndex++] = GetSeqCharAsByte(value);
            if (readLen % 2 == 0)
            {
                value = alignmentBlock[index] & 0x0F;
                //strbuilder.Append(GetSeqChar(value));
                seqData[seqDataIndex++] = GetSeqCharAsByte(value);
            }
            startIndex = index + 1;
            // string strSequence = strbuilder.ToString();
            //Insert qual value catch here?  ADDING NEW QUALITY SCORE FINDER!!!
            byte[] qualValues    = new byte[readLen];
            string strQualValues = "*";

            if (alignmentBlock[startIndex] != 0xFF)
            {
                for (int i = startIndex; i < (startIndex + readLen); i++)
                {
                    qualValues[i - startIndex] = (byte)(alignmentBlock[i] + 33);
                }

                strQualValues = System.Text.ASCIIEncoding.ASCII.GetString(qualValues);
            }
            //END NEW EDITION!

            //var syms = Encoding.UTF8.GetBytes(strSequence);
            var alpha = Alphabets.AutoDetectAlphabet(seqData, 0, seqData.Length, null);
            //Sequence toReturn = new Sequence(alpha, syms);
            //TODO: Possibly a bit unsafe here
            var toReturn = new CompactSAMSequence(alpha, FastQFormatType.GATK_Recalibrated, seqData, qualValues, false);

            toReturn.ID       = name;
            toReturn.Pos      = Pos;
            toReturn.CIGAR    = cigar;
            toReturn.RName    = RName;
            toReturn.SAMFlags = (SAMFlags)flagValue;
            return(toReturn);
        }

예제 #12

파일 보기

        /// <summary>
        /// Parses a single biological sequence alignment text from a reader.
        /// </summary>
        /// <param name="reader">A reader for a biological sequence alignment text.</param>
        /// <returns>The parsed ISequenceAlignment object.</returns>
        public ISequenceAlignment ParseOne(TextReader reader)
        {
            string message = string.Empty;

            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            if (line == null)
            {
                ReadNextLine(reader);
            }

            // no empty files allowed
            if (line == null)
            {
                throw new InvalidDataException(Properties.Resource.IONoTextToParse);
            }

            // Parse first line
            IList <string> tokens = line.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            if (2 != tokens.Count)
            {
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name);
                throw new InvalidDataException(message);
            }

            bool isFirstBlock   = true;
            int  sequenceCount  = 0;
            int  sequenceLength = 0;
            IList <Tuple <Sequence, List <byte> > > data = new List <Tuple <Sequence, List <byte> > >();
            string id             = string.Empty;
            string sequenceString = string.Empty;
            Tuple <Sequence, List <byte> > sequence = null;
            IAlphabet alignmentAlphabet             = null;

            sequenceCount  = Int32.Parse(tokens[0], CultureInfo.InvariantCulture);
            sequenceLength = Int32.Parse(tokens[1], CultureInfo.InvariantCulture);

            ReadNextLine(reader);  // Skip blank lines until we get to the first block.

            // Now that we're at the first block, one or more blank lines are the block separators, which we'll need.
            skipBlankLines = false;

            while (reader.Peek() != -1)
            {
                if (string.IsNullOrWhiteSpace(line))
                {
                    ReadNextLine(reader);
                    continue;
                }

                for (int index = 0; index < sequenceCount; index++)
                {
                    if (isFirstBlock)
                    {
                        // First 10 characters are sequence ID, remaining is the first block of sequence
                        // Note that both may contain whitespace, and there may be no whitespace between them.
                        if (line.Length <= 10)
                        {
                            message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name);
                            throw new Exception(message);
                        }
                        id             = line.Substring(0, 10).Trim();
                        sequenceString = line.Substring(10).Replace(" ", "");
                        byte[] sequenceBytes = System.Text.ASCIIEncoding.ASCII.GetBytes(sequenceString);

                        IAlphabet alphabet = Alphabet;
                        if (null == alphabet)
                        {
                            alphabet = Alphabets.AutoDetectAlphabet(sequenceBytes, 0, sequenceBytes.Length, alphabet);

                            if (null == alphabet)
                            {
                                message = string.Format(
                                    CultureInfo.InvariantCulture,
                                    Properties.Resource.InvalidSymbolInString,
                                    sequenceString);
                                throw new InvalidDataException(message);
                            }
                            else
                            {
                                if (null == alignmentAlphabet)
                                {
                                    alignmentAlphabet = alphabet;
                                }
                                else
                                {
                                    if (alignmentAlphabet != alphabet)
                                    {
                                        throw new InvalidDataException(Properties.Resource.SequenceAlphabetMismatch);
                                    }
                                }
                            }
                        }

                        Tuple <Sequence, List <byte> > sequenceStore = new Tuple <Sequence, List <byte> >(
                            new Sequence(alphabet, string.Empty)
                        {
                            ID = id
                        },
                            new List <byte>());

                        sequenceStore.Item2.AddRange(sequenceBytes);
                        data.Add(sequenceStore);
                    }
                    else
                    {
                        sequence = data[index];
                        byte[] sequenceBytes = System.Text.ASCIIEncoding.ASCII.GetBytes(line.Replace(" ", ""));
                        sequence.Item2.AddRange(sequenceBytes);
                    }

                    ReadNextLine(reader);
                }

                // Reset the first block flag
                isFirstBlock = false;
            }

            // Validate for the count of sequence
            if (sequenceCount != data.Count)
            {
                throw new InvalidDataException(Properties.Resource.SequenceCountMismatch);
            }

            SequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(new AlignedSequence());

            foreach (var dataSequence in data)
            {
                // Validate for the count of sequence
                if (sequenceLength != dataSequence.Item2.Count)
                {
                    throw new InvalidDataException(Properties.Resource.SequenceLengthMismatch);
                }

                sequenceAlignment.AlignedSequences[0].Sequences.Add(
                    new Sequence(dataSequence.Item1.Alphabet, dataSequence.Item2.ToArray())
                {
                    ID = dataSequence.Item1.ID
                });
            }

            return(sequenceAlignment);
        }

예제 #13

파일 보기

파일: FastAParser.cs 프로젝트: sjmercer65/bio

        /// <summary>
        /// Returns an IEnumerable of sequences in the stream being parsed.
        /// </summary>
        /// <param name="reader">Stream to parse.</param>
        /// <param name="buffer">Buffer to use.</param>
        /// <returns>Returns a Sequence.</returns>
        ISequence ParseOne(TextReader reader, byte[] buffer)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            if (reader.Peek() == -1)
            {
                return(null);
            }

            int currentBufferSize = PlatformManager.Services.DefaultBufferSize;

            string message;
            string line = reader.ReadLine();

            // Continue reading if blank line found.
            while (line != null && string.IsNullOrEmpty(line))
            {
                line = reader.ReadLine();
            }

            if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
            {
                message = string.Format(
                    CultureInfo.InvariantCulture,
                    Properties.Resource.INVALID_INPUT_FILE,
                    Properties.Resource.FASTA_NAME);

                throw new Exception(message);
            }

            string name           = line.Substring(1);
            int    bufferPosition = 0;

            // Read next line.
            line = reader.ReadLine();

            // Continue reading if blank line found.
            while (line != null && string.IsNullOrEmpty(line))
            {
                line = reader.ReadLine();
            }

            if (line == null)
            {
                message = string.Format(
                    CultureInfo.InvariantCulture,
                    Properties.Resource.InvalidSymbolInString,
                    string.Empty);
                throw new Exception(message);
            }

            IAlphabet alphabet = Alphabet;
            bool      tryAutoDetectAlphabet = alphabet == null;

            do
            {
                // Files > 2G are not supported in this release.
                if ((((long)bufferPosition + line.Length) >= PlatformManager.Services.MaxSequenceSize))
                {
                    throw new ArgumentOutOfRangeException(
                              string.Format(CultureInfo.CurrentUICulture, Properties.Resource.SequenceDataGreaterthan2GB, name));
                }
                int neededSize = bufferPosition + line.Length;
                if (neededSize >= currentBufferSize)
                {
                    //Grow file dynamically, by buffer size, or if too small to fit the new sequence by the size of the sequence
                    int suggestedSize = buffer.Length + PlatformManager.Services.DefaultBufferSize;
                    int newSize       = neededSize < suggestedSize ? suggestedSize : neededSize;
                    Array.Resize(ref buffer, newSize);
                    currentBufferSize = newSize;
                }

                byte[] symbols = Encoding.UTF8.GetBytes(line);

                // Array.Copy -- for performance improvement.
                Array.Copy(symbols, 0, buffer, bufferPosition, symbols.Length);

                // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
                if (tryAutoDetectAlphabet)
                {
                    // If we have a base alphabet we detected earlier,
                    // then try that first.
                    if (this.baseAlphabet != null &&
                        this.baseAlphabet.ValidateSequence(buffer, bufferPosition, line.Length))
                    {
                        alphabet = this.baseAlphabet;
                    }
                    // Otherwise attempt to identify alphabet
                    else
                    {
                        // Different alphabet - try to auto detect.
                        this.baseAlphabet = null;
                        alphabet          = Alphabets.AutoDetectAlphabet(buffer, bufferPosition, bufferPosition + line.Length, alphabet);
                        if (alphabet == null)
                        {
                            throw new Exception(string.Format(CultureInfo.InvariantCulture,
                                                              Properties.Resource.InvalidSymbolInString, line));
                        }
                    }

                    // Determine the base alphabet used.
                    if (this.baseAlphabet == null)
                    {
                        this.baseAlphabet = alphabet;
                    }
                    else
                    {
                        // If they are not the same, then this might be an error.
                        if (this.baseAlphabet != alphabet)
                        {
                            // If the new alphabet includes all the base alphabet then use it instead.
                            // This happens when we hit an ambiguous form of the alphabet later in the file.
                            if (!this.baseAlphabet.HasAmbiguity && Alphabets.GetAmbiguousAlphabet(this.baseAlphabet) == alphabet)
                            {
                                this.baseAlphabet = alphabet;
                            }
                            else if (alphabet.HasAmbiguity || Alphabets.GetAmbiguousAlphabet(alphabet) != this.baseAlphabet)
                            {
                                throw new Exception(Properties.Resource.FastAContainsMorethanOnebaseAlphabet);
                            }
                        }
                    }
                }
                else
                {
                    // Validate against supplied alphabet.
                    if (!alphabet.ValidateSequence(buffer, bufferPosition, line.Length))
                    {
                        throw new Exception(string.Format(CultureInfo.InvariantCulture, Properties.Resource.InvalidSymbolInString, line));
                    }
                }

                bufferPosition += line.Length;

                if (reader.Peek() == (byte)'>')
                {
                    break;
                }

                // Read next line.
                line = reader.ReadLine();

                // Continue reading if blank line found.
                while (line != null && string.IsNullOrEmpty(line) && reader.Peek() != (byte)'>')
                {
                    line = reader.ReadLine();
                }
            }while (line != null);

            // Truncate buffer to remove trailing 0's
            byte[] tmpBuffer = new byte[bufferPosition];
            Array.Copy(buffer, tmpBuffer, bufferPosition);

            if (tryAutoDetectAlphabet)
            {
                alphabet = this.baseAlphabet;
            }

            // In memory sequence
            return(new Sequence(alphabet, tmpBuffer, false)
            {
                ID = name
            });
        }

예제 #14

파일 보기

        /// <summary>
        /// Convert the delta alignment object to its sequence representation
        /// </summary>
        /// <returns>Reference sequence alignment at 0th index and
        /// Query sequence alignment at 1st index</returns>
        public PairwiseAlignedSequence ConvertDeltaToSequences()
        {
            PairwiseAlignedSequence alignedSequence = new PairwiseAlignedSequence();
            int         gap          = 0;
            List <long> startOffsets = new List <long>(2);
            List <long> endOffsets   = new List <long>(2);
            List <long> insertions   = new List <long>(2);

            startOffsets.Add(FirstSequenceStart);
            startOffsets.Add(SecondSequenceStart);
            endOffsets.Add(FirstSequenceEnd);
            endOffsets.Add(SecondSequenceEnd);

            insertions.Add(0);
            insertions.Add(0);

            // Create the new sequence object with given start and end indices
            List <byte> referenceSequence = new List <byte>();

            for (long index = this.FirstSequenceStart; index <= this.FirstSequenceEnd; index++)
            {
                referenceSequence.Add(this.ReferenceSequence[index]);
            }

            List <byte> querySequence = new List <byte>();

            for (long index = this.SecondSequenceStart; index <= this.SecondSequenceEnd; index++)
            {
                querySequence.Add(this.QuerySequence[index]);
            }
            // Insert the Alignment character at delta position
            // +ve delta: Insertion in reference sequence
            // -ve delta: Insertion in query sequence (deletion in reference sequence)
            foreach (int delta in Deltas)
            {
                gap += Math.Abs(delta);
                if (delta < 0)
                {
                    referenceSequence.Insert(gap - 1, DnaAlphabet.Instance.Gap);
                    insertions[0]++;
                }
                else
                {
                    querySequence.Insert(gap - 1, DnaAlphabet.Instance.Gap);
                    insertions[1]++;
                }
            }

            byte[]    refSeq   = referenceSequence.ToArray();
            IAlphabet alphabet = Alphabets.AutoDetectAlphabet(refSeq, 0, refSeq.LongLength, null);

            alignedSequence.FirstSequence = new Sequence(alphabet, refSeq, false);

            byte[] querySeq = querySequence.ToArray();
            alphabet = Alphabets.AutoDetectAlphabet(querySeq, 0, querySeq.LongLength, QuerySequence.Alphabet);
            alignedSequence.SecondSequence = new Sequence(alphabet, querySeq, false);

            alignedSequence.Metadata["StartOffsets"] = startOffsets;
            alignedSequence.Metadata["EndOffsets"]   = endOffsets;
            alignedSequence.Metadata["Insertions"]   = insertions;

            return(alignedSequence);
        }

예제 #15

파일 보기

        /// <summary>
        /// Get all the gaps in each sequence and call pairwise alignment.
        /// </summary>
        /// <param name="referenceSequence">Reference sequence.</param>
        /// <param name="sequence">Query sequence.</param>
        /// <param name="mums">List of MUMs.</param>
        /// <returns>Aligned sequences.</returns>
        private PairwiseAlignedSequence ProcessGaps(
            ISequence referenceSequence,
            ISequence sequence,
            IList <Match> mums)
        {
            List <byte>             sequenceResult1 = new List <byte>();
            List <byte>             sequenceResult2 = new List <byte>();
            List <byte>             consensusResult = new List <byte>();
            PairwiseAlignedSequence alignedSequence = new PairwiseAlignedSequence();
            Match mum1;
            Match mum2;

            // Run the alignment for gap before first MUM
            List <long> insertions = new List <long>(2);

            insertions.Add(0);
            insertions.Add(0);

            List <long> gapInsertions;

            mum1 = mums.First();
            alignedSequence.Score += this.AlignGap(
                referenceSequence,
                sequence,
                sequenceResult1,
                sequenceResult2,
                consensusResult,
                new Match()
            {
                Length = 0
            },                                  // Here the first MUM does not exist
                mum1,
                out gapInsertions);

            insertions[0] += gapInsertions[0];
            insertions[1] += gapInsertions[1];

            // Run the alignment for all the gaps between MUM
            for (int index = 1; index < mums.Count; index++)
            {
                mum2 = mums[index];

                alignedSequence.Score += this.AlignGap(
                    referenceSequence,
                    sequence,
                    sequenceResult1,
                    sequenceResult2,
                    consensusResult,
                    mum1,
                    mum2,
                    out gapInsertions);

                insertions[0] += gapInsertions[0];
                insertions[1] += gapInsertions[1];

                mum1 = mum2;
            }

            // Run the alignment for gap after last MUM
            alignedSequence.Score += this.AlignGap(
                referenceSequence,
                sequence,
                sequenceResult1,
                sequenceResult2,
                consensusResult,
                mum1,
                new Match()
            {
                Length = 0
            },
                out gapInsertions);

            insertions[0] += gapInsertions[0];
            insertions[1] += gapInsertions[1];

            byte[]    result1  = sequenceResult1.ToArray();
            IAlphabet alphabet = Alphabets.AutoDetectAlphabet(result1, 0, result1.LongLength, referenceSequence.Alphabet);

            alignedSequence.FirstSequence = new Sequence(
                alphabet,
                result1)
            {
                ID       = referenceSequence.ID,
                Metadata = referenceSequence.Metadata
            };

            byte[] result2 = sequenceResult2.ToArray();
            alphabet = Alphabets.AutoDetectAlphabet(result2, 0, result2.LongLength, sequence.Alphabet);

            alignedSequence.SecondSequence = new Sequence(
                alphabet,
                result2)
            {
                ID       = sequence.ID,
                Metadata = sequence.Metadata
            };

            byte[] consensus = consensusResult.ToArray();
            alphabet = Alphabets.AutoDetectAlphabet(consensus, 0, consensus.LongLength, referenceSequence.Alphabet);
            alignedSequence.Consensus = new Sequence(
                alphabet,
                consensus);

            // Offset is not required as Smith Waterman will  fragmented alignment.
            // Offset is the starting position of alignment of sequence1 with respect to sequence2.
            if (this.PairWiseAlgorithm is NeedlemanWunschAligner)
            {
                alignedSequence.FirstOffset = alignedSequence.FirstSequence.IndexOfNonGap() -
                                              referenceSequence.IndexOfNonGap();
                alignedSequence.SecondOffset = alignedSequence.SecondSequence.IndexOfNonGap() -
                                               sequence.IndexOfNonGap();
            }

            List <long> startOffsets = new List <long>(2);
            List <long> endOffsets   = new List <long>(2);

            startOffsets.Add(0);
            startOffsets.Add(0);

            endOffsets.Add(referenceSequence.Count - 1);
            endOffsets.Add(sequence.Count - 1);

            alignedSequence.Metadata["StartOffsets"] = startOffsets;
            alignedSequence.Metadata["EndOffsets"]   = endOffsets;
            alignedSequence.Metadata["Insertions"]   = insertions;

            // return the aligned sequence
            return(alignedSequence);
        }

예제 #16

파일 보기

        /// <summary>
        /// Parses a single biological sequence alignment text from a reader.
        /// </summary>
        /// <param name="reader">A reader for a biological sequence alignment text.</param>
        /// <returns>The parsed ISequenceAlignment object.</returns>
        ISequenceAlignment ParseOne(TextReader reader)
        {
            ReadNextLine(reader);
            if (line == null)
            {
                throw new Exception(Properties.Resource.INVALID_INPUT_FILE);
            }

            this.ParseHeader(reader);

            var            alignedSequence = new AlignedSequence();
            IList <string> ids             = null;
            bool           isInBlock       = true;

            if (this.line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
            {
                while (this.line != null && isInBlock)
                {
                    if (string.IsNullOrEmpty(this.line.Trim()))
                    {
                        this.ReadNextLine(reader);
                        continue;
                    }

                    string blockName = GetTokens(this.line)[1];

                    switch (blockName.ToUpperInvariant())
                    {
                    case "TAXA":
                    case "TAXA;":
                        // This block contains the count of sequence & title of each sequence
                        ids = this.ParseTaxaBlock(reader);
                        break;

                    case "CHARACTERS":
                    case "CHARACTERS;":
                        // Block contains sequences
                        Dictionary <string, string> dataSet = this.ParseCharacterBlock(reader, ids);
                        IAlphabet alignmentAlphabet         = null;

                        foreach (string id in ids)
                        {
                            IAlphabet alphabet = this.Alphabet;
                            string    data     = dataSet[id];

                            if (null == alphabet)
                            {
                                byte[] dataArray = data.ToByteArray();
                                alphabet = Alphabets.AutoDetectAlphabet(dataArray, 0, dataArray.Length, null);

                                if (null == alphabet)
                                {
                                    throw new InvalidDataException(string.Format(
                                                                       CultureInfo.InvariantCulture,
                                                                       Properties.Resource.InvalidSymbolInString,
                                                                       data));
                                }

                                if (null == alignmentAlphabet)
                                {
                                    alignmentAlphabet = alphabet;
                                }
                                else
                                {
                                    if (alignmentAlphabet != alphabet)
                                    {
                                        throw new InvalidDataException(string.Format(
                                                                           CultureInfo.InvariantCulture,
                                                                           Properties.Resource.SequenceAlphabetMismatch));
                                    }
                                }
                            }

                            alignedSequence.Sequences.Add(new Sequence(alphabet, data)
                            {
                                ID = id
                            });
                        }

                        break;

                    case "END":
                    case "END;":
                        // Have reached the end of block
                        isInBlock = false;
                        break;

                    default:
                        // skip this block
                        while (this.line != null)
                        {
                            this.ReadNextLine(reader);
                            if (0 == string.Compare(this.line, "end;", StringComparison.OrdinalIgnoreCase))
                            {
                                break;
                            }
                        }
                        break;
                    }

                    this.ReadNextLine(reader);
                }
            }

            ISequenceAlignment sequenceAlignment = new SequenceAlignment();

            sequenceAlignment.AlignedSequences.Add(alignedSequence);
            return(sequenceAlignment);
        }

예제 #17

파일 보기

        /// <summary>
        /// Parses a single biological sequence alignment text from a reader.
        /// </summary>
        /// <param name="reader">A reader for a biological sequence alignment text.</param>
        /// <returns>The parsed ISequenceAlignment object.</returns>
        public ISequenceAlignment ParseOne(TextReader reader)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            ReadNextLine(reader);
            if (line == null)
            {
                string message = Properties.Resource.INVALID_INPUT_FILE;
                Trace.Report(message);
                throw new FileFormatException(message);
            }
            else
            {
                ParseHeader(reader);

                string             message           = string.Empty;
                ISequenceAlignment sequenceAlignment = new SequenceAlignment();
                sequenceAlignment.AlignedSequences.Add(new AlignedSequence());
                IList <string> ids       = null;
                bool           isInBlock = true;

                if (line.StartsWith("begin", StringComparison.OrdinalIgnoreCase))
                {
                    while (line != null && isInBlock)
                    {
                        if (string.IsNullOrEmpty(line.Trim()))
                        {
                            ReadNextLine(reader);
                            continue;
                        }

                        string blockName = GetTokens(line)[1];

                        switch (blockName.ToUpper(CultureInfo.InvariantCulture))
                        {
                        case "TAXA":
                        case "TAXA;":
                            // This block contains the count of sequence & title of each sequence
                            ids = (IList <string>)ParseTaxaBlock(reader);

                            break;

                        case "CHARACTERS":
                        case "CHARACTERS;":
                            // Block contains sequences
                            Dictionary <string, string> dataSet = ParseCharacterBlock(reader, ids);

                            IAlphabet alignmentAlphabet = null;
                            string    data = string.Empty;

                            foreach (string ID in ids)
                            {
                                IAlphabet alphabet = Alphabet;
                                Sequence  sequence = null;
                                data = dataSet[ID];

                                if (null == alphabet)
                                {
                                    byte[] dataArray = data.Select(a => (byte)a).ToArray();
                                    alphabet = Alphabets.AutoDetectAlphabet(dataArray, 0, dataArray.Length, null);

                                    if (null == alphabet)
                                    {
                                        message = string.Format(
                                            CultureInfo.InvariantCulture,
                                            Properties.Resource.InvalidSymbolInString,
                                            data);
                                        throw new InvalidDataException(message);
                                    }
                                    else
                                    {
                                        if (null == alignmentAlphabet)
                                        {
                                            alignmentAlphabet = alphabet;
                                        }
                                        else
                                        {
                                            if (alignmentAlphabet != alphabet)
                                            {
                                                message = string.Format(
                                                    CultureInfo.InvariantCulture,
                                                    Properties.Resource.SequenceAlphabetMismatch);
                                                throw new InvalidDataException(message);
                                            }
                                        }
                                    }
                                }

                                sequence    = new Sequence(alphabet, data);
                                sequence.ID = ID;
                                sequenceAlignment.AlignedSequences[0].Sequences.Add(sequence);
                            }

                            break;

                        case "END":
                        case "END;":
                            // Have reached the end of block
                            isInBlock = false;

                            break;

                        default:
                            // skip this block
                            while (line != null)
                            {
                                ReadNextLine(reader);
                                if (0 == string.Compare(line, "end;", StringComparison.OrdinalIgnoreCase))
                                {
                                    break;
                                }
                            }

                            break;
                        }

                        ReadNextLine(reader);
                    }
                }

                return(sequenceAlignment);
            }
        }

예제 #18

파일 보기

        /// <summary>
        /// Launches the alignment algorithm
        /// </summary>
        public virtual List <IPairwiseSequenceAlignment> Align()
        {
            InitializeCache();

            // Grid
            for (int diagonal = 0; diagonal < gridCols + gridRows - 2; diagonal++)
            {
                for (int blockRow = 0; blockRow < gridRows; blockRow++)
                {
                    int blockCol = diagonal - blockRow;

                    if ((blockCol >= 0) && (blockCol < gridCols))
                    {
                        int lastRow = (blockRow == gridRows - 1) ? (int)(colHeight - Math.BigMul(blockRow, gridStride) - 1) : gridStride;
                        int lastCol = (blockCol == gridCols - 1) ? (int)(rowWidth - Math.BigMul(blockCol, gridStride) - 1) : gridStride;

                        ComputeIntermediateBlock(blockRow, blockCol, lastRow, lastCol);
                    }
                }
            }

            sbyte[][] trace = new sbyte[gridStride + 1][];
            for (int i = 0; i <= gridStride; i++)
            {
                trace[i] = new sbyte[gridStride + 1];
            }

            // Last Block - grid calculation and Traceback combined
            int completeTraceRow = gridRows - 1;
            int completeTraceCol = gridCols - 1;

            int completeLastRow = (int)(colHeight - Math.BigMul(completeTraceRow, gridStride) - 1);
            int completeLastCol = (int)(rowWidth - Math.BigMul(completeTraceCol, gridStride) - 1);

            ComputeCornerBlock(completeTraceRow, completeTraceCol, completeLastRow, completeLastCol, trace);

            //Traceback
            if (optScoreCells.Count == 0)
            {
                return(new List <IPairwiseSequenceAlignment>());
            }
            else
            {
                PairwiseSequenceAlignment alignment = new PairwiseSequenceAlignment(sequenceI, sequenceJ);

                for (int alignmentCount = 0; alignmentCount < optScoreCells.Count; alignmentCount++)
                {
                    PairwiseAlignedSequence result = new PairwiseAlignedSequence();
                    result.Score = optScore;

                    long alignmentRow = optScoreCells[alignmentCount].Item1;
                    long alignmentCol = optScoreCells[alignmentCount].Item2;

                    int blockRow = (int)(alignmentRow / gridStride);
                    int blockCol = (int)(alignmentCol / gridStride);

                    int lastRow = (int)(alignmentRow - Math.BigMul(blockRow, gridStride));
                    int lastCol = (int)(alignmentCol - Math.BigMul(blockCol, gridStride));

                    result.Metadata["EndOffsets"] = new List <long> {
                        alignmentRow - 1, alignmentCol - 1
                    };

                    long   alignmentLength = 0;
                    byte[] sequence1       = new byte[colHeight + rowWidth];
                    byte[] sequence2       = new byte[colHeight + rowWidth];

                    int colGaps = 0;
                    int rowGaps = 0;

                    while ((blockRow >= 0) && (blockCol >= 0))
                    {
                        if ((blockRow != completeTraceRow) || (blockCol != completeTraceCol) || (lastRow > completeLastRow) || (lastCol > completeLastCol))
                        {
                            ComputeTraceBlock(blockRow, blockCol, lastRow, lastCol, trace);

                            completeTraceRow = blockRow;
                            completeTraceCol = blockCol;

                            completeLastRow = lastRow;
                            completeLastCol = lastCol;
                        }

                        long startPositionI = blockRow * gridStride - 1;
                        long startPositionJ = blockCol * gridStride - 1;

                        while ((trace[lastRow][lastCol] != SourceDirection.Stop) && (trace[lastRow][lastCol] != SourceDirection.Block))
                        {
                            switch (trace[lastRow][lastCol])
                            {
                            case SourceDirection.Diagonal:
                                // diagonal, no gap, use both sequence residues
                                sequence1[alignmentLength] = sequenceI[startPositionI + lastRow];
                                sequence2[alignmentLength] = sequenceJ[startPositionJ + lastCol];
                                alignmentLength++;
                                lastRow--;
                                lastCol--;
                                break;

                            case SourceDirection.Up:
                                // up, gap in J
                                sequence1[alignmentLength] = sequenceI[startPositionI + lastRow];
                                sequence2[alignmentLength] = this.gapCode;
                                alignmentLength++;
                                lastRow--;
                                colGaps++;
                                break;

                            case SourceDirection.Left:
                                // left, gap in I
                                sequence1[alignmentLength] = this.gapCode;
                                sequence2[alignmentLength] = sequenceJ[startPositionJ + lastCol];
                                alignmentLength++;
                                lastCol--;
                                rowGaps++;
                                break;
                            }
                        }

                        if (trace[lastRow][lastCol] == SourceDirection.Stop)
                        {
                            // Be nice, turn aligned solutions around so that they match the input sequences
                            byte[] alignedA = new byte[alignmentLength];
                            byte[] alignedB = new byte[alignmentLength];
                            for (long i = 0, j = alignmentLength - 1; i < alignmentLength; i++, j--)
                            {
                                alignedA[i] = sequence1[j];
                                alignedB[i] = sequence2[j];
                            }

                            // If alphabet of inputA is DnaAlphabet then alphabet of alignedA may be Dna or AmbiguousDna.
                            IAlphabet alphabet = Alphabets.AutoDetectAlphabet(alignedA, 0, alignedA.LongLength, sequenceI.Alphabet);
                            Sequence  seq      = new Sequence(alphabet, alignedA, false);
                            seq.ID = sequenceI.ID;
                            // seq.DisplayID = aInput.DisplayID;
                            result.FirstSequence = seq;

                            alphabet = Alphabets.AutoDetectAlphabet(alignedB, 0, alignedB.LongLength, sequenceJ.Alphabet);
                            seq      = new Sequence(alphabet, alignedB, false);
                            seq.ID   = sequenceJ.ID;
                            // seq.DisplayID = bInput.DisplayID;
                            result.SecondSequence = seq;

                            // Offset is start of alignment in input sequence with respect to other sequence.
                            if (lastCol >= lastRow)
                            {
                                result.FirstOffset  = lastCol - lastRow;
                                result.SecondOffset = 0;
                            }
                            else
                            {
                                result.FirstOffset  = 0;
                                result.SecondOffset = lastRow - lastCol;
                            }
                            result.Metadata["StartOffsets"] = new List <long> {
                                lastRow, lastCol
                            };
                            result.Metadata["Insertions"] = new List <long> {
                                rowGaps, colGaps
                            };
                            alignment.PairwiseAlignedSequences.Add(result);

                            break;
                        }
                        else
                        {
                            if (lastRow == 0 && lastCol == 0)
                            {
                                blockRow--;
                                blockCol--;
                                lastRow = gridStride;
                                lastCol = gridStride;
                            }
                            else
                            {
                                if (lastRow == 0)
                                {
                                    blockRow--;
                                    lastRow = gridStride;
                                }
                                else
                                {
                                    blockCol--;
                                    lastCol = gridStride;
                                }
                            }
                        }
                    }
                }

                return(new List <IPairwiseSequenceAlignment>()
                {
                    alignment
                });
            }
        }

예제 #19

파일 보기

        /// <summary>
        /// Returns an IEnumerable of DeltaAlignment in the file being parsed.
        /// </summary>
        /// <returns>Returns DeltaAlignment collection.</returns>
        public IList <IEnumerable <DeltaAlignment> > Parse()
        {
            bool skipBlankLine     = true;
            int  currentBufferSize = BufferSize;

            byte[]    buffer   = new byte[currentBufferSize];
            IAlphabet alphabet = null;
            List <IEnumerable <DeltaAlignment> > result = new List <IEnumerable <DeltaAlignment> >();
            IList <DeltaAlignment> deltaAlignments      = new List <DeltaAlignment>();
            string message = string.Empty;

            using (StreamReader streamReader = new StreamReader(this.Filename))
            {
                if (streamReader.EndOfStream)
                {
                    message = string.Format(
                        CultureInfo.InvariantCulture,
                        Resources.INVALID_INPUT_FILE,
                        Resources.Parser_Name);

                    throw new FileFormatException(message);
                }

                ReadNextLine(streamReader);

                do
                {
                    if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Resources.INVALID_INPUT_FILE,
                            Resources.Parser_Name);

                        throw new FileFormatException(message);
                    }

                    //First line - reference id
                    string referenceId    = line.Substring(1);
                    int    bufferPosition = 0;

                    // Read next line.
                    ReadNextLine(streamReader);

                    //Second line - Query sequence id
                    string queryId = line;

                    //third line - query sequence
                    // Read next line.
                    ReadNextLine(streamReader);

                    // For large files copy the data in memory mapped file.
                    if ((((long)bufferPosition + line.Length) >= MaximumSequenceLength))
                    {
                        throw new ArgumentOutOfRangeException(
                                  string.Format(CultureInfo.CurrentUICulture, Resources.SequenceDataGreaterthan2GB, queryId));
                    }

                    if (((bufferPosition + line.Length) >= currentBufferSize))
                    {
                        Array.Resize <byte>(ref buffer, buffer.Length + BufferSize);
                        currentBufferSize += BufferSize;
                    }

                    byte[] symbols = ASCIIEncoding.ASCII.GetBytes(line);

                    // Array.Copy -- for performance improvement.
                    Array.Copy(symbols, 0, buffer, bufferPosition, symbols.Length);

                    alphabet = Alphabets.AutoDetectAlphabet(buffer, bufferPosition, bufferPosition + line.Length, alphabet);
                    if (alphabet == null)
                    {
                        throw new FileFormatException(string.Format(Resources.InvalidSymbolInString, line));
                    }

                    bufferPosition += line.Length;

                    // Truncate buffer to remove trailing 0's

                    byte[] tmpBuffer = new byte[bufferPosition];
                    Array.Copy(buffer, tmpBuffer, bufferPosition);

                    Sequence sequence = null;

                    // In memory sequence
                    sequence    = new Sequence(alphabet, tmpBuffer, false);
                    sequence.ID = queryId;

                    Sequence refEmpty = new Sequence(sequence.Alphabet, "A", false);
                    refEmpty.ID = referenceId;

                    DeltaAlignment deltaAlignment = new DeltaAlignment(refEmpty, sequence);

                    //Fourth line - properties of deltaalignment
                    // Read next line.
                    ReadNextLine(streamReader);

                    string[] deltaAlignmentProperties = line.Split(' ');
                    if (deltaAlignmentProperties != null && deltaAlignmentProperties.Length == 7)
                    {
                        long temp;
                        deltaAlignment.FirstSequenceStart  = long.TryParse(deltaAlignmentProperties[0], out temp) ? temp : 0;
                        deltaAlignment.FirstSequenceEnd    = long.TryParse(deltaAlignmentProperties[1], out temp) ? temp : 0;
                        deltaAlignment.SecondSequenceStart = long.TryParse(deltaAlignmentProperties[2], out temp) ? temp : 0;
                        deltaAlignment.SecondSequenceEnd   = long.TryParse(deltaAlignmentProperties[3], out temp) ? temp : 0;
                        int error;
                        deltaAlignment.Errors           = int.TryParse(deltaAlignmentProperties[4], out error) ? error : 0;
                        deltaAlignment.SimilarityErrors = int.TryParse(deltaAlignmentProperties[5], out error) ? error : 0;
                        deltaAlignment.NonAlphas        = int.TryParse(deltaAlignmentProperties[6], out error) ? error : 0;
                    }

                    //Fifth line - either a 0 - marks the end of the delta alignment or they are deltas
                    while (line != null && !line.StartsWith("*", StringComparison.OrdinalIgnoreCase))
                    {
                        long temp;
                        if (long.TryParse(line, out temp))
                        {
                            deltaAlignment.Deltas.Add(temp);
                        }
                        // Read next line.
                        line = streamReader.ReadLine();

                        // Continue reading if blank line found.
                        while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
                        {
                            line = streamReader.ReadLine();
                        }
                    }

                    deltaAlignments.Add(deltaAlignment);
                    //Read the next line
                    line = streamReader.ReadLine();
                    if (line.StartsWith("--", StringComparison.OrdinalIgnoreCase))
                    {
                        result.Add(deltaAlignments);
                        //clear the inner list
                        deltaAlignments = new List <DeltaAlignment>();

                        //skip until the next valid delta is found
                        do
                        {
                            line = streamReader.ReadLine();
                        }while (line != null && line.StartsWith("--", StringComparison.OrdinalIgnoreCase));
                    }
                }while (line != null);
            }
            return(result);
        }

예제 #20

파일 보기

파일: MUMmerAligner.cs 프로젝트: sjmercer65/bio

        /// <summary>
        /// Get all the gaps in each sequence and call pairwise alignment.
        /// </summary>
        /// <param name="referenceSequence">Reference sequence.</param>
        /// <param name="sequence">Query sequence.</param>
        /// <param name="mums">List of MUMs.</param>
        /// <returns>Aligned sequences.</returns>
        private PairwiseAlignedSequence ProcessGaps(
            ISequence referenceSequence,
            ISequence sequence,
            IList <Match> mums)
        {
            List <byte>             sequenceResult1 = new List <byte>();
            List <byte>             sequenceResult2 = new List <byte>();
            List <byte>             consensusResult = new List <byte>();
            PairwiseAlignedSequence alignedSequence = new PairwiseAlignedSequence();
            Match mum1;
            Match mum2;

            // Run the alignment for gap before first MUM
            List <long> insertions = new List <long>(2);

            insertions.Add(0);
            insertions.Add(0);

            List <long> gapInsertions;

            mum1 = mums.First();
            alignedSequence.Score += this.AlignGap(
                referenceSequence,
                sequence,
                sequenceResult1,
                sequenceResult2,
                consensusResult,
                new Match()
            {
                Length = 0
            },                                  // Here the first MUM does not exist
                mum1,
                out gapInsertions);

            insertions[0] += gapInsertions[0];
            insertions[1] += gapInsertions[1];

            // Run the alignment for all the gaps between MUM
            for (int index = 1; index < mums.Count; index++)
            {
                mum2 = mums[index];

                alignedSequence.Score += this.AlignGap(
                    referenceSequence,
                    sequence,
                    sequenceResult1,
                    sequenceResult2,
                    consensusResult,
                    mum1,
                    mum2,
                    out gapInsertions);

                insertions[0] += gapInsertions[0];
                insertions[1] += gapInsertions[1];

                mum1 = mum2;
            }

            // Run the alignment for gap after last MUM
            alignedSequence.Score += this.AlignGap(
                referenceSequence,
                sequence,
                sequenceResult1,
                sequenceResult2,
                consensusResult,
                mum1,
                new Match()
            {
                Length = 0
            },
                out gapInsertions);

            insertions[0] += gapInsertions[0];
            insertions[1] += gapInsertions[1];

            byte[]    result1  = sequenceResult1.ToArray();
            IAlphabet alphabet = Alphabets.AutoDetectAlphabet(result1, 0, result1.GetLongLength(), referenceSequence.Alphabet);

            alignedSequence.FirstSequence = new Sequence(
                alphabet,
                result1)
            {
                ID = referenceSequence.ID,
                // Do not shallow copy dictionary
                //Metadata = referenceSequence.Metadata
            };

            byte[] result2 = sequenceResult2.ToArray();
            alphabet = Alphabets.AutoDetectAlphabet(result2, 0, result2.GetLongLength(), sequence.Alphabet);

            alignedSequence.SecondSequence = new Sequence(
                alphabet,
                result2)
            {
                ID = sequence.ID,
                // Do not shallow copy dictionary
                //Metadata = sequence.Metadata
            };

            byte[] consensus = consensusResult.ToArray();
            alphabet = Alphabets.AutoDetectAlphabet(consensus, 0, consensus.GetLongLength(), referenceSequence.Alphabet);
            alignedSequence.Consensus = new Sequence(
                alphabet,
                consensus);

            alignedSequence.FirstOffset  = alignedSequence.FirstSequence.IndexOfNonGap() - referenceSequence.IndexOfNonGap();
            alignedSequence.SecondOffset = alignedSequence.SecondSequence.IndexOfNonGap() - sequence.IndexOfNonGap();


            List <long> startOffsets = new List <long>(2);
            List <long> endOffsets   = new List <long>(2);

            startOffsets.Add(0);
            startOffsets.Add(0);

            endOffsets.Add(referenceSequence.Count - 1);
            endOffsets.Add(sequence.Count - 1);

            alignedSequence.Metadata["StartOffsets"] = startOffsets;
            alignedSequence.Metadata["EndOffsets"]   = endOffsets;
            alignedSequence.Metadata["Insertions"]   = insertions;

            // return the aligned sequence
            return(alignedSequence);
        }

예제 #21

파일 보기

        /// <summary>
        /// Returns an IEnumerable of sequences in the file being parsed.
        /// </summary>
        /// <returns>Returns ISequence arrays.</returns>
        public IEnumerable <ISequence> Parse()
        {
            IAlphabet alphabet          = this.Alphabet;
            IAlphabet baseAlphabet      = null;
            int       currentBufferSize = BufferSize;

            byte[] buffer        = new byte[currentBufferSize];
            bool   skipBlankLine = true;
            string message       = string.Empty;

            bool tryAutoDetectAlphabet;

            if (alphabet == null)
            {
                tryAutoDetectAlphabet = true;
            }
            else
            {
                tryAutoDetectAlphabet = false;
            }

            using (StreamReader streamReader = new StreamReader(this.Filename))
            {
                if (streamReader.EndOfStream)
                {
                    message = string.Format(
                        CultureInfo.InvariantCulture,
                        Bio.Properties.Resource.INVALID_INPUT_FILE,
                        Bio.Properties.Resource.FASTA_NAME);

                    throw new FileFormatException(message);
                }

                string line = streamReader.ReadLine();

                // Continue reading if blank line found.
                while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
                {
                    line = streamReader.ReadLine();
                }

                do
                {
                    if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Bio.Properties.Resource.INVALID_INPUT_FILE,
                            Bio.Properties.Resource.FASTA_NAME);

                        throw new FileFormatException(message);
                    }

                    string name           = line.Substring(1);
                    int    bufferPosition = 0;

                    if (tryAutoDetectAlphabet)
                    {
                        alphabet = baseAlphabet;
                    }

                    string sequenceTempFileName = string.Empty;

                    // Read next line.
                    line = streamReader.ReadLine();

                    // Continue reading if blank line found.
                    while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
                    {
                        line = streamReader.ReadLine();
                    }

                    if (line == null)
                    {
                        message = string.Format(
                            CultureInfo.InvariantCulture,
                            Properties.Resource.InvalidSymbolInString,
                            string.Empty);
                        throw new FileFormatException(message);
                    }

                    do
                    {
                        // For large files copy the data in memory mapped file.
                        if ((((long)bufferPosition + line.Length) >= MaximumSequenceLength))
                        {
                            throw new ArgumentOutOfRangeException(
                                      string.Format(CultureInfo.CurrentUICulture, Properties.Resource.SequenceDataGreaterthan2GB, name));
                        }

                        if (((bufferPosition + line.Length) >= currentBufferSize))
                        {
                            Array.Resize <byte>(ref buffer, buffer.Length + BufferSize);
                            currentBufferSize += BufferSize;
                        }

                        byte[] symbols = ASCIIEncoding.ASCII.GetBytes(line);

                        // Array.Copy -- for performance improvement.
                        Array.Copy(symbols, 0, buffer, bufferPosition, symbols.Length);

                        // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
                        if (tryAutoDetectAlphabet)
                        {
                            alphabet = Alphabets.AutoDetectAlphabet(buffer, bufferPosition, bufferPosition + line.Length, alphabet);
                            if (alphabet == null)
                            {
                                throw new FileFormatException(string.Format(Properties.Resource.InvalidSymbolInString, line));
                            }
                        }
                        else if (this.Alphabet != null)
                        {
                            if (!this.Alphabet.ValidateSequence(buffer, bufferPosition, bufferPosition + line.Length))
                            {
                                throw new FileFormatException(string.Format(Properties.Resource.InvalidSymbolInString, line));
                            }
                        }

                        bufferPosition += line.Length;

                        // Read next line.
                        line = streamReader.ReadLine();

                        // Continue reading if blank line found.
                        while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
                        {
                            line = streamReader.ReadLine();
                        }
                    }while (line != null && !line.StartsWith(">", StringComparison.OrdinalIgnoreCase));

                    // Truncate buffer to remove trailing 0's
                    byte[] tmpBuffer = new byte[bufferPosition];
                    Array.Copy(buffer, tmpBuffer, bufferPosition);

                    Sequence sequence = null;

                    if (tryAutoDetectAlphabet)
                    {
                        IAlphabet tmpalphabet     = alphabet;
                        IAlphabet tmpbaseAlphabet = null;
                        while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(tmpalphabet, out tmpbaseAlphabet))
                        {
                            tmpalphabet = tmpbaseAlphabet;
                        }

                        if (tmpbaseAlphabet == null)
                        {
                            tmpbaseAlphabet = tmpalphabet;
                        }

                        if (baseAlphabet == null)
                        {
                            baseAlphabet = tmpbaseAlphabet;
                        }

                        if (tmpbaseAlphabet != baseAlphabet)
                        {
                            throw new FileFormatException(Properties.Resource.FastAContainsMorethanOnebaseAlphabet);
                        }
                    }

                    // In memory sequence
                    sequence    = new Sequence(alphabet, tmpBuffer, false);
                    sequence.ID = name;
                    yield return(sequence);
                }while (line != null);
            }
        }

예제 #22

파일 보기

        /// <summary>
        /// Gets the IEnumerable of QualitativeSequences from the stream being parsed.
        /// </summary>
        /// <param name="streamReader">Stream to be parsed.</param>
        /// <returns>Returns a QualitativeSequence.</returns>
        private QualitativeSequence ParseOne(StreamReader streamReader)
        {
            IAlphabet       alphabet = this.Alphabet;
            bool            autoDetectFastQFormat = this.AutoDetectFastQFormat;
            FastQFormatType formatType            = this.FormatType;
            bool            skipBlankLine         = true;

            bool tryAutoDetectAlphabet;

            if (alphabet == null)
            {
                tryAutoDetectAlphabet = true;
            }
            else
            {
                tryAutoDetectAlphabet = false;
            }

            if (streamReader.EndOfStream)
            {
                string exMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    Properties.Resource.INVALID_INPUT_FILE,
                    Properties.Resource.FastQName);

                throw new FileFormatException(exMessage);
            }

            string message = string.Empty;

            string line = streamReader.ReadLine();

            // Continue reading if blank line found.
            while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
            {
                line = streamReader.ReadLine();
            }

            if (line == null || !line.StartsWith("@", StringComparison.Ordinal))
            {
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.INVALID_INPUT_FILE, this.Name);
                throw new FileFormatException(message);
            }

            // Process header line.
            string id = line.Substring(1).Trim();

            line = streamReader.ReadLine();

            // Continue reading if blank line found.
            while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
            {
                line = streamReader.ReadLine();
            }

            if (string.IsNullOrEmpty(line))
            {
                string message1 = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidSequenceLine, id);
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, message1);
                throw new FileFormatException(message);
            }

            // Get sequence from second line.
            byte[] sequenceData = UTF8Encoding.UTF8.GetBytes(line);

            // Goto third line.
            line = streamReader.ReadLine();

            // Continue reading if blank line found.
            while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
            {
                line = streamReader.ReadLine();
            }

            // Check for '+' symbol in the third line.
            if (line == null || !line.StartsWith("+", StringComparison.Ordinal))
            {
                string message1 = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidQualityScoreHeaderLine, id);
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, message1);
                throw new FileFormatException(message);
            }

            string qualScoreId = line.Substring(1).Trim();

            if (!string.IsNullOrEmpty(qualScoreId) && !id.Equals(qualScoreId))
            {
                string message1 = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidQualityScoreHeaderData, id);
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, message1);
                throw new FileFormatException(message);
            }

            // Goto fourth line.
            line = streamReader.ReadLine();

            // Continue reading if blank line found.
            while (skipBlankLine && line != null && string.IsNullOrEmpty(line))
            {
                line = streamReader.ReadLine();
            }

            if (string.IsNullOrEmpty(line))
            {
                string message1 = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_EmptyQualityScoreLine, id);
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, message1);
                throw new FileFormatException(message);
            }

            // Get the quality scores from the fourth line.
            byte[] qualScores = UTF8Encoding.UTF8.GetBytes(line);

            // Check for sequence length and quality score length.
            if (sequenceData.LongLength() != qualScores.LongLength())
            {
                string message1 = string.Format(CultureInfo.CurrentCulture, Properties.Resource.FastQ_InvalidQualityScoresLength, id);
                message = string.Format(CultureInfo.CurrentCulture, Properties.Resource.IOFormatErrorMessage, this.Name, message1);
                throw new FileFormatException(message);
            }

            // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
            if (tryAutoDetectAlphabet)
            {
                alphabet = Alphabets.AutoDetectAlphabet(sequenceData, 0, sequenceData.LongLength(), alphabet);
                if (alphabet == null)
                {
                    throw new FileFormatException(Properties.Resource.CouldNotIdentifyAlphabetType);
                }
            }
            else if (alphabet != null)
            {
                if (!alphabet.ValidateSequence(sequenceData, 0, sequenceData.LongLength()))
                {
                    throw new FileFormatException(Properties.Resource.InvalidAlphabetType);
                }
            }

            // Identify fastq format type if AutoDetectFastQFormat property is set to true.
            if (autoDetectFastQFormat)
            {
                formatType = IdentifyFastQFormatType(qualScores);
            }

            QualitativeSequence qualitativeSequence = new QualitativeSequence(alphabet, formatType, sequenceData, qualScores, false);

            qualitativeSequence.ID = id;

            // Update the propeties so that next parse will use this data.
            this.FormatType = formatType;

            return(qualitativeSequence);
        }