/// <summary>
/// Identifies the alphabet for the specified sequence.
/// </summary>
/// <param name="currentAlphabet">Currently known alphabet of the sequence, null if alphabet is unknown.</param>
/// <param name="sequence">Sequence data.</param>
/// <returns>Returns appropriate alphabet for the specified sequence and considering the specified current alphabet.
/// Returns null if the sequence is null or empty, or if any character in the sequence is unrecognized
/// by DNA, RNA and Protein Alphabets.</returns>
protected IAlphabet IdentifyAlphabet(IAlphabet currentAlphabet, string sequence)
{
    if (string.IsNullOrEmpty(sequence))
    {
        return null;
    }

    // Alphabets use upper case characters so to prevent parsing errors ensure the sequence
    // is all upper case.
    sequence = sequence.ToUpperInvariant();

    // A flag array indexed by character code is much faster than performing
    // sequence.ToCharArray().Distinct(). It is sized from the highest character of the
    // current alphabet, or of all known alphabets when the current alphabet is unknown.
    int characters = 0;
    if (currentAlphabet != null)
    {
        characters = Alphabets.GetHighestChar(currentAlphabet) + 1;
    }
    else
    {
        foreach (var item in Alphabets.All)
        {
            characters = Math.Max(characters, Alphabets.GetHighestChar(item) + 1);
        }
    }

    // Array elements are initialised to false by the runtime; no explicit clearing is needed.
    var characterExists = new bool[characters];

    // Characters whose code does not fit in the flag array (for example a symbol that lies
    // above the current alphabet's highest character) are tracked here instead of indexing
    // past the end of the array. Allocated lazily because this is the uncommon path.
    System.Collections.Generic.HashSet<char> outOfRangeSymbols = null;

    // Distinct symbols are collected in order of first occurrence.
    var uniqueValues = new StringBuilder();
    foreach (char sequenceChar in sequence)
    {
        bool firstOccurrence;
        if (sequenceChar < characterExists.Length)
        {
            firstOccurrence = !characterExists[sequenceChar];
            characterExists[sequenceChar] = true;
        }
        else
        {
            if (outOfRangeSymbols == null)
            {
                outOfRangeSymbols = new System.Collections.Generic.HashSet<char>();
            }

            // HashSet.Add returns true only when the element was not already present.
            firstOccurrence = outOfRangeSymbols.Add(sequenceChar);
        }

        if (firstOccurrence)
        {
            uniqueValues.Append(sequenceChar);
        }
    }

    // When _distinctSymbols is already populated, merge into it and leave it in place on exit.
    // When it is null, this call creates a temporary list and must clear it again before returning.
    bool canClearDistinctSymbol = false;
    if (_distinctSymbols != null)
    {
        _distinctSymbols = _distinctSymbols.Union(uniqueValues.ToString()).ToList();
    }
    else
    {
        canClearDistinctSymbol = true;
        _distinctSymbols = uniqueValues.ToString().ToCharArray().ToList();
    }

    try
    {
        // Start the check from the currently known alphabet; anything other than Protein
        // or RNA (including an unknown alphabet) starts from DNA.
        // NOTE(review): the StartCheckFrom* helpers are not visible here; presumably they
        // read _distinctSymbols and return null for unrecognized symbols - confirm.
        if (currentAlphabet == Alphabets.Protein)
        {
            return StartCheckFromProtein();
        }

        if (currentAlphabet == Alphabets.RNA)
        {
            return StartCheckFromRna();
        }

        return StartCheckFromDna();
    }
    finally
    {
        // Reset the temporary symbol list even if a check throws, so a failed call
        // does not leave stale state behind for the next one.
        if (canClearDistinctSymbol)
        {
            _distinctSymbols = null;
        }
    }
}