/// <summary>
/// Validates that GetValidSymbols() of the alphabet selected by
/// <paramref name="option"/> yields exactly the expected symbols.
/// The check is order-sensitive: the returned bytes are concatenated into a
/// string in the set's enumeration order and compared with a reference string.
/// </summary>
/// <param name="option">Alphabet to validate (Protein, Rna or Dna).</param>
void ValidateGetValidSymbols(AlphabetsTypes option)
{
    string referenceCharacters;
    IAlphabet alphabetInstance;

    switch (option)
    {
        case AlphabetsTypes.Protein:
            referenceCharacters = "AaCcDdEeFfGgHhIiKkLlMmNnOoPpQqRrSsTtUuVvWwYy-*";
            alphabetInstance = ProteinAlphabet.Instance;
            break;
        case AlphabetsTypes.Rna:
            referenceCharacters = "AaCcGgUu-";
            alphabetInstance = RnaAlphabet.Instance;
            break;
        case AlphabetsTypes.Dna:
            referenceCharacters = "AaCcGgTt-";
            alphabetInstance = DnaAlphabet.Instance;
            break;
        default:
            // Fail with a clear message instead of dereferencing a null alphabet below.
            Assert.Fail(string.Concat("Unsupported alphabet option: ", option));
            return;
    }

    // Use the returned set directly; no placeholder set needs to be allocated first.
    HashSet<byte> validSymbolsByte = alphabetInstance.GetValidSymbols();
    string validSymbols = new string(validSymbolsByte.Select(a => (char)a).ToArray());

    Assert.AreEqual(referenceCharacters, validSymbols);
    ApplicationLog.WriteLine(string.Concat(
        "Alphabets BVT: Validation of Alphabets operation ",
        option,
        " completed successfully."));
}
/// <summary>
/// Creates a subsequence from the source sequence (SequenceToSplit) given the
/// settings provided. A random window of the source is copied and, with
/// probability settings.ErrorFrequency per position, a symbol is replaced by a
/// randomly chosen symbol of the result alphabet (gaps and terminations excluded).
/// </summary>
/// <param name="settings">Simulation settings: target length, length variation,
/// distribution type, error frequency, ambiguity and reversal options.</param>
/// <param name="index">Zero-based index of this split; shown one-based in the generated ID.</param>
/// <returns>
/// The generated sequence, or a DerivedSequence wrapping it when
/// settings.ReverseHalf is set and a random draw falls below 0.5.
/// </returns>
private ISequence CreateSubsequence(SimulatorSettings settings, long index)
{
    double err = (double)settings.ErrorFrequency;

    // Set the length using the appropriate random number distribution type
    long subLength = settings.SequenceLength;
    if (settings.DistributionType == (int)Distribution.Uniform)
    {
        // NOTE(review): Next(max) excludes max, so the offset lies in
        // [-LengthVariation, LengthVariation - 1]; confirm whether +LengthVariation
        // is meant to be reachable.
        subLength += random.Next(settings.LengthVariation * 2) - settings.LengthVariation;
    }
    else if (settings.DistributionType == (int)Distribution.Normal)
    {
        subLength = (long)Math.Floor(Bio.Util.Helper.GetNormalRandom((double)settings.SequenceLength, (double)settings.LengthVariation));
    }

    // Quick sanity checks on the length of the subsequence
    if (subLength <= 0)
    {
        subLength = 1;
    }

    if (subLength > SequenceToSplit.Count)
    {
        subLength = SequenceToSplit.Count;
    }

    // Set the start position so that the window fits inside the source sequence
    long startPosition = (long)Math.Floor(random.NextDouble() * (SequenceToSplit.Count - subLength));
    byte[] sequenceBytes = new byte[subLength];
    IAlphabet resultSequenceAlphabet = SequenceToSplit.Alphabet;

    // Switch to the ambiguous counterpart of the alphabet when ambiguities are allowed
    if (settings.AllowAmbiguities &&
        (SequenceToSplit.Alphabet == DnaAlphabet.Instance ||
         SequenceToSplit.Alphabet == RnaAlphabet.Instance ||
         SequenceToSplit.Alphabet == ProteinAlphabet.Instance))
    {
        resultSequenceAlphabet = Alphabets.AmbiguousAlphabetMap[SequenceToSplit.Alphabet];
    }

    List<byte> errorSource = resultSequenceAlphabet.GetValidSymbols().ToList();

    // Remove gap and termination symbols so they are never injected as errors
    HashSet<byte> gaps, terminations;
    SequenceToSplit.Alphabet.TryGetGapSymbols(out gaps);
    SequenceToSplit.Alphabet.TryGetTerminationSymbols(out terminations);

    if (gaps != null)
    {
        errorSource.RemoveAll(a => gaps.Contains(a));
    }

    if (terminations != null)
    {
        errorSource.RemoveAll(a => terminations.Contains(a));
    }

    for (long i = 0; i < subLength; i++)
    {
        // Apply errors if applicable
        if (random.NextDouble() < err)
        {
            // Next(max) returns a value strictly below max (assuming `random` is a
            // System.Random), so pass Count rather than Count - 1; otherwise the
            // last symbol in errorSource could never be selected.
            sequenceBytes[i] = errorSource[random.Next(errorSource.Count)];
        }
        else
        {
            sequenceBytes[i] = SequenceToSplit[startPosition + i];
        }
    }

    // sequenceBytes is a fresh local array that is not used afterwards, so it is
    // handed over directly without an extra copy.
    Sequence generatedSequence = new Sequence(resultSequenceAlphabet, sequenceBytes);
    generatedSequence.ID = SequenceToSplit.ID + " (Split " + (index + 1) + ", " + generatedSequence.Count + "bp)";

    // Reverse sequence if applicable
    // NOTE(review): the flags (true, false) presumably mean reverse without
    // complement - confirm against the DerivedSequence constructor.
    if (settings.ReverseHalf && random.NextDouble() < 0.5f)
    {
        return new DerivedSequence(generatedSequence, true, false);
    }

    return generatedSequence;
}
/// <summary>
/// Creates a subsequence from a source sequence given the settings provided.
/// A random window of <paramref name="sequenceToSplit"/> is copied and, with
/// probability ErrorFrequency per position, a symbol is replaced by a randomly
/// chosen symbol of the result alphabet (gaps and terminations excluded).
/// </summary>
/// <param name="index">Zero-based index of this split; shown one-based in the generated ID.</param>
/// <param name="sequenceToSplit">Source sequence the subsequence is drawn from.</param>
/// <param name="simulatorSettings">Simulation settings: target length, length variation,
/// distribution type, error frequency, ambiguity and reversal options.</param>
/// <returns>
/// The generated sequence, or a DerivedSequence wrapping it when ReverseHalf is
/// set and a random draw falls below 0.5.
/// </returns>
private ISequence CreateSubsequence(long index, ISequence sequenceToSplit, SimulatorSettings simulatorSettings)
{
    double err = simulatorSettings.ErrorFrequency;

    // Set the length using the appropriate random number distribution type
    long subLength = simulatorSettings.SequenceLength;
    switch (simulatorSettings.DistributionType)
    {
        case (int)Distribution.Uniform:
            // NOTE(review): Next(max) excludes max, so the offset lies in
            // [-LengthVariation, LengthVariation - 1]; confirm whether
            // +LengthVariation is meant to be reachable.
            subLength += _seqRandom.Next(simulatorSettings.LengthVariation * 2) - simulatorSettings.LengthVariation;
            break;
        case (int)Distribution.Normal:
            subLength = (long)Math.Floor(Bio.Util.Helper.GetNormalRandom(simulatorSettings.SequenceLength, simulatorSettings.LengthVariation));
            break;
    }

    // Quick sanity checks on the length of the subsequence
    if (subLength <= 0)
    {
        subLength = 1;
    }

    if (subLength > sequenceToSplit.Count)
    {
        subLength = sequenceToSplit.Count;
    }

    // Set the start position so that the window fits inside the source sequence
    long startPosition = (long)Math.Floor(_seqRandom.NextDouble() * (sequenceToSplit.Count - subLength));
    byte[] sequenceBytes = new byte[subLength];
    IAlphabet resultSequenceAlphabet = sequenceToSplit.Alphabet;

    // Switch to the ambiguous counterpart of the alphabet when ambiguities are allowed
    if (simulatorSettings.AllowAmbiguities &&
        (sequenceToSplit.Alphabet == DnaAlphabet.Instance ||
         sequenceToSplit.Alphabet == RnaAlphabet.Instance ||
         sequenceToSplit.Alphabet == ProteinAlphabet.Instance))
    {
        resultSequenceAlphabet = Alphabets.AmbiguousAlphabetMap[sequenceToSplit.Alphabet];
    }

    List<byte> errorSource = resultSequenceAlphabet.GetValidSymbols().ToList();

    // Remove gap and termination symbols so they are never injected as errors
    HashSet<byte> gaps, terminations;
    sequenceToSplit.Alphabet.TryGetGapSymbols(out gaps);
    sequenceToSplit.Alphabet.TryGetTerminationSymbols(out terminations);

    if (gaps != null)
    {
        errorSource.RemoveAll(a => gaps.Contains(a));
    }

    if (terminations != null)
    {
        errorSource.RemoveAll(a => terminations.Contains(a));
    }

    for (long i = 0; i < subLength; i++)
    {
        // Apply errors if applicable
        if (_seqRandom.NextDouble() < err)
        {
            // Next(max) returns a value strictly below max (assuming `_seqRandom`
            // is a System.Random), so pass Count rather than Count - 1; otherwise
            // the last symbol in errorSource could never be selected.
            sequenceBytes[i] = errorSource[_seqRandom.Next(errorSource.Count)];
        }
        else
        {
            sequenceBytes[i] = sequenceToSplit[startPosition + i];
        }
    }

    // sequenceBytes is a fresh local array that is not used afterwards, so it is
    // handed over directly without an extra copy.
    ISequence generatedSequence = new Sequence(resultSequenceAlphabet, sequenceBytes);
    generatedSequence.ID = sequenceToSplit.ID + " (Split " + (index + 1) + ", " + generatedSequence.Count + "bp)";

    // Reverse sequence if applicable
    // NOTE(review): the flags (true, true) presumably mean reverse and
    // complement - confirm against the DerivedSequence constructor.
    if (simulatorSettings.ReverseHalf && _seqRandom.NextDouble() < 0.5f)
    {
        return new DerivedSequence(generatedSequence, true, true);
    }

    return generatedSequence;
}