Esempio n. 1
0
        /// <summary>
        /// Identifies the alphabet for the specified sequence.
        /// </summary>
        /// <param name="currentAlphabet">Currently known alphabet of the sequence, null if alphabet is unknown.</param>
        /// <param name="sequence">Sequence data.</param>
        /// <returns>Returns the appropriate alphabet for the specified sequence, taking the specified current alphabet into account.
        /// Returns null if the sequence is null or empty, or if any character in the sequence is unrecognized by the
        /// DNA, RNA and Protein alphabets.</returns>
        protected IAlphabet IdentifyAlphabet(IAlphabet currentAlphabet, string sequence)
        {
            if (string.IsNullOrEmpty(sequence))
            {
                return null;
            }

            // Alphabets use upper case characters, so to prevent parsing errors ensure the sequence
            // is all upper case.
            sequence = sequence.ToUpperInvariant();

            // Initial size of the "seen" lookup table: one slot per character code up to the highest
            // character of the current alphabet, or of every known alphabet when none is given.
            int characters = 0;

            if (currentAlphabet != null)
            {
                characters = Alphabets.GetHighestChar(currentAlphabet) + 1;
            }
            else
            {
                foreach (var item in Alphabets.All)
                {
                    characters = Math.Max(characters, Alphabets.GetHighestChar(item) + 1);
                }
            }

            // A flag table indexed by character code is much faster than performing
            // sequence.ToCharArray().Distinct(). New bool arrays are already all-false,
            // so no explicit initialization is required.
            var characterExists = new bool[characters];
            var uniqueValues = new StringBuilder();

            foreach (char sequenceChar in sequence)
            {
                // A character above the expected range (e.g. a symbol that belongs to a wider
                // alphabet, or an unrecognized/non-ASCII character) must not index past the end
                // of the table. Grow the table so the character is still collected and the
                // alphabet checks below decide what to do with it.
                if (sequenceChar >= characterExists.Length)
                {
                    Array.Resize(ref characterExists, sequenceChar + 1);
                }

                if (!characterExists[sequenceChar])
                {
                    characterExists[sequenceChar] = true;
                    uniqueValues.Append(sequenceChar);
                }
            }

            // When _distinctSymbols is already populated, the new symbols are merged into it and it is
            // left in place. Otherwise this call creates it and is responsible for clearing it again.
            bool canClearDistinctSymbol = false;

            if (_distinctSymbols != null)
            {
                _distinctSymbols = _distinctSymbols.Union(uniqueValues.ToString()).ToList();
            }
            else
            {
                canClearDistinctSymbol = true;
                _distinctSymbols       = uniqueValues.ToString().ToCharArray().ToList();
            }

            try
            {
                // Start the check from the currently known alphabet; DNA is the default starting point.
                if (currentAlphabet == Alphabets.Protein)
                {
                    return StartCheckFromProtein();
                }

                if (currentAlphabet == Alphabets.RNA)
                {
                    return StartCheckFromRna();
                }

                return StartCheckFromDna();
            }
            finally
            {
                // Reset the temporary symbol list even if an alphabet check throws, so that stale
                // symbols never leak into a later call.
                if (canClearDistinctSymbol)
                {
                    _distinctSymbols = null;
                }
            }
        }