/// <summary>
/// Creates a subsequence from the source <c>Sequence</c> given the settings provided:
/// a randomly positioned window of random length, with symbols optionally replaced
/// by random error symbols, and optionally returned reversed.
/// </summary>
/// <param name="settings">
/// Simulation settings; this method reads ErrorFrequency, SequenceLength, LengthVariation,
/// DistributionType, AllowAmbiguities and ReverseHalf.
/// </param>
/// <param name="index">Zero-based index of the subsequence; only used (as index + 1) in the generated ID.</param>
/// <returns>The generated subsequence, or its <c>Reverse</c> when ReverseHalf is set and a random draw falls below 0.5.</returns>
private ISequence CreateSubsequence(SimulatorSettings settings, int index)
{
    double err = (double)settings.ErrorFrequency;

    // Set the length using the appropriate random number distribution type
    int subLength = settings.SequenceLength;
    if (settings.DistributionType == (int)Distribution.Uniform)
    {
        // Random.Next(max) never returns max itself, so the "+ 1" is needed to make the
        // offset cover the symmetric range [-LengthVariation, +LengthVariation].
        subLength += random.Next(settings.LengthVariation * 2 + 1) - settings.LengthVariation;
    }
    else if (settings.DistributionType == (int)Distribution.Normal)
    {
        // Presumably (mean, standard deviation) - confirm against GetNormalRandom.
        subLength = (int)MBF.Util.Helper.GetNormalRandom((double)settings.SequenceLength, (double)settings.LengthVariation);
    }

    // Quick sanity checks on the length of the subsequence
    if (subLength <= 0)
    {
        subLength = 1;
    }

    if (subLength > Sequence.Count)
    {
        subLength = Sequence.Count;
    }

    // Set the start position. The last valid start is Count - subLength; because the
    // upper bound of Random.Next is exclusive, "+ 1" is required for that window
    // (and therefore the final symbol of the source) to be reachable.
    int startPosition = random.Next(Sequence.Count - subLength + 1);

    Sequence result = new Sequence(Sequence.Alphabet);
    result.IsReadOnly = false;

    // Candidate symbols used when an error is injected.
    List<ISequenceItem> errorSource = Sequence.Alphabet.LookupAll(true, false, settings.AllowAmbiguities, false);

    for (int i = 0; i < subLength; i++)
    {
        // Apply Errors if applicable
        if (random.NextDouble() < err)
        {
            // Exclusive upper bound: pass Count (not Count - 1) so every candidate,
            // including the last one, can be selected.
            result.Add(errorSource[random.Next(errorSource.Count)]);
        }
        else
        {
            result.Add(Sequence[startPosition + i]);
        }
    }

    result.ID = Sequence.ID + " (Split " + (index + 1) + ", " + result.Count + "bp)";

    // Reverse Sequence if applicable
    if (settings.ReverseHalf && random.NextDouble() < 0.5f)
    {
        return result.Reverse;
    }

    return result;
}
/// <summary>
/// Does the logic behind the sequence simulation: validates the inputs, generates the
/// simulated subsequences and writes them, in batches of at most
/// <c>settings.OutputSequenceCount</c>, to files named "&lt;prefix&gt;_&lt;n&gt;.fa".
/// </summary>
/// <param name="window">Window that receives the sequence/file counts and the completion notification.</param>
/// <param name="outputFileName">
/// Output path; its extension (if any) is stripped and the remainder is used as the
/// prefix of every generated file.
/// </param>
/// <param name="settings">Simulation settings (output count per file, mean length, depth of coverage, ...).</param>
/// <exception cref="ArgumentException">
/// The output directory does not exist, or OutputSequenceCount / SequenceLength is not greater than zero.
/// </exception>
internal void DoSimulation(SimulatorWindow window, string outputFileName, SimulatorSettings settings)
{
    FileInfo file = new FileInfo(outputFileName);
    if (!file.Directory.Exists)
    {
        throw new ArgumentException("Could not write to the output directory for " + outputFileName);
    }

    if (settings.OutputSequenceCount <= 0)
    {
        throw new ArgumentException("'Max Output Sequences Per File' should be greater than zero.");
    }

    if (settings.SequenceLength <= 0)
    {
        throw new ArgumentException("'Mean Output Length' should be greater than zero.");
    }

    string filePrefix;
    if (String.IsNullOrEmpty(file.Extension))
    {
        filePrefix = file.FullName;
    }
    else
    {
        // Strip the extension from the END of the path. A plain IndexOf would cut at the
        // first occurrence of the extension text (e.g. inside a directory named "runs.fa"),
        // and the ordinal comparison keeps the search culture-independent.
        filePrefix = file.FullName.Substring(0, file.FullName.LastIndexOf(file.Extension, StringComparison.Ordinal));
    }

    string filePostfix = "_{0}.fa";

    long seqCount = (settings.DepthOfCoverage * SequenceToSplit.Count) / settings.SequenceLength;

    // Number of files = ceil(seqCount / OutputSequenceCount).
    long fileCount = seqCount / settings.OutputSequenceCount;
    if (seqCount % settings.OutputSequenceCount != 0)
    {
        fileCount++;
    }

    window.UpdateSimulationStats(seqCount, fileCount);

    if (generatedSequenceList == null)
    {
        generatedSequenceList = new List<ISequence>();
    }
    else
    {
        generatedSequenceList.Clear();
    }

    int fileIndex = 1;
    FastAFormatter formatter = null;

    for (long i = 0; i < seqCount; i++)
    {
        generatedSequenceList.Add(CreateSubsequence(settings, i));

        // Flush a full batch to its own file and start collecting the next one.
        if (generatedSequenceList.Count >= settings.OutputSequenceCount)
        {
            FileInfo outFile = new FileInfo(filePrefix + string.Format(filePostfix, fileIndex++));
            formatter = WriteFastAFile(outFile.FullName, generatedSequenceList);
            generatedSequenceList.Clear();
        }
    }

    if (generatedSequenceList.Count > 0)
    {
        // Write the final, partially filled batch.
        FileInfo outFile = new FileInfo(filePrefix + string.Format(filePostfix, fileIndex));
        formatter = WriteFastAFile(outFile.FullName, generatedSequenceList);
        window.NotifySimulationComplete(formatter.Name);
    }
    else
    {
        // NOTE(review): this branch is also taken when seqCount is an exact multiple of
        // OutputSequenceCount, i.e. files were written inside the loop but an empty name
        // is reported. Behaviour kept as-is; confirm that this is intended.
        window.NotifySimulationComplete(string.Empty);
    }
}

// Writes the given sequences to a FastA file at the given path and returns the (closed)
// formatter; the formatter is closed even when a write throws.
private static FastAFormatter WriteFastAFile(string path, IEnumerable<ISequence> sequences)
{
    FastAFormatter formatter = new FastAFormatter(path);
    try
    {
        foreach (ISequence seq in sequences)
        {
            formatter.Write(seq);
        }
    }
    finally
    {
        formatter.Close();
    }

    return formatter;
}
/// <summary>
/// Creates a subsequence from <c>SequenceToSplit</c> given the settings provided:
/// a randomly positioned window of random length, with symbols optionally replaced
/// by random error symbols, and optionally wrapped in a reversed <c>DerivedSequence</c>.
/// </summary>
/// <param name="settings">
/// Simulation settings; this method reads ErrorFrequency, SequenceLength, LengthVariation,
/// DistributionType, AllowAmbiguities and ReverseHalf.
/// </param>
/// <param name="index">Zero-based index of the subsequence; only used (as index + 1) in the generated ID.</param>
/// <returns>
/// The generated sequence, or a <c>DerivedSequence</c> over it (constructed with
/// <c>true, false</c>) when ReverseHalf is set and a random draw falls below 0.5.
/// </returns>
private ISequence CreateSubsequence(SimulatorSettings settings, long index)
{
    double err = (double)settings.ErrorFrequency;

    // Set the length using the appropriate random number distribution type
    long subLength = settings.SequenceLength;
    if (settings.DistributionType == (int)Distribution.Uniform)
    {
        // Random.Next(max) never returns max itself, so the "+ 1" is needed to make the
        // offset cover the symmetric range [-LengthVariation, +LengthVariation].
        subLength += random.Next(settings.LengthVariation * 2 + 1) - settings.LengthVariation;
    }
    else if (settings.DistributionType == (int)Distribution.Normal)
    {
        // Presumably (mean, standard deviation) - confirm against GetNormalRandom.
        subLength = (long)Math.Floor(Bio.Util.Helper.GetNormalRandom((double)settings.SequenceLength, (double)settings.LengthVariation));
    }

    // Quick sanity checks on the length of the subsequence
    if (subLength <= 0)
    {
        subLength = 1;
    }

    if (subLength > SequenceToSplit.Count)
    {
        subLength = SequenceToSplit.Count;
    }

    // Set the start position. NextDouble() is always below 1.0, so scaling by
    // (lastStart + 1) makes every start in [0, lastStart] reachable; scaling by
    // lastStart alone could never select the final window. Math.Min guards against
    // floating-point rounding pushing the product up to lastStart + 1.
    long lastStart = SequenceToSplit.Count - subLength;
    long startPosition = Math.Min(lastStart, (long)Math.Floor(random.NextDouble() * (lastStart + 1)));

    byte[] sequenceBytes = new byte[subLength];
    IAlphabet resultSequenceAlphabet = SequenceToSplit.Alphabet;

    // Switch to the mapped ambiguous alphabet when ambiguities are allowed and the
    // source uses one of the three base alphabets that are looked up in the map.
    if (settings.AllowAmbiguities &&
        (SequenceToSplit.Alphabet == DnaAlphabet.Instance ||
         SequenceToSplit.Alphabet == RnaAlphabet.Instance ||
         SequenceToSplit.Alphabet == ProteinAlphabet.Instance))
    {
        resultSequenceAlphabet = Alphabets.AmbiguousAlphabetMap[SequenceToSplit.Alphabet];
    }

    // Candidate symbols used when an error is injected.
    List<byte> errorSource = resultSequenceAlphabet.GetValidSymbols().ToList();

    // remove gap and termination symbol
    HashSet<byte> gaps, terminations;
    SequenceToSplit.Alphabet.TryGetGapSymbols(out gaps);
    SequenceToSplit.Alphabet.TryGetTerminationSymbols(out terminations);

    if (gaps != null)
    {
        errorSource.RemoveAll(a => gaps.Contains(a));
    }

    if (terminations != null)
    {
        errorSource.RemoveAll(a => terminations.Contains(a));
    }

    for (long i = 0; i < subLength; i++)
    {
        // Apply Errors if applicable
        if (random.NextDouble() < err)
        {
            // Exclusive upper bound: pass Count (not Count - 1) so every candidate,
            // including the last one, can be selected.
            sequenceBytes[i] = errorSource[random.Next(errorSource.Count)];
        }
        else
        {
            sequenceBytes[i] = SequenceToSplit[startPosition + i];
        }
    }

    // The buffer is not touched after this point, so it is passed directly
    // instead of through an extra ToArray() copy.
    Sequence generatedSequence = new Sequence(resultSequenceAlphabet, sequenceBytes);
    generatedSequence.ID = SequenceToSplit.ID + " (Split " + (index + 1) + ", " + generatedSequence.Count + "bp)";

    // Reverse Sequence if applicable
    if (settings.ReverseHalf && random.NextDouble() < 0.5f)
    {
        return new DerivedSequence(generatedSequence, true, false);
    }

    return generatedSequence;
}
/// <summary>
/// Initializes a new controller: creates the random number generator used by the
/// simulation and a newly constructed <see cref="SimulatorSettings"/> instance.
/// </summary>
public SimulatorController()
{
    this._seqRandom = new Random();
    this.Settings = new SimulatorSettings();
}
/// <summary>
/// Creates a subsequence from a source sequence given the settings provided:
/// a randomly positioned window of random length, with symbols optionally replaced
/// by random error symbols, and optionally wrapped in a <c>DerivedSequence</c>.
/// </summary>
/// <param name="index">Zero-based index of the subsequence; only used (as index + 1) in the generated ID.</param>
/// <param name="sequenceToSplit">Source sequence the window is taken from.</param>
/// <param name="simulatorSettings">
/// Simulation settings; this method reads ErrorFrequency, SequenceLength, LengthVariation,
/// DistributionType, AllowAmbiguities and ReverseHalf.
/// </param>
/// <returns>
/// The generated sequence, or a <c>DerivedSequence</c> over it (constructed with
/// <c>true, true</c>) when ReverseHalf is set and a random draw falls below 0.5.
/// </returns>
private ISequence CreateSubsequence(long index, ISequence sequenceToSplit, SimulatorSettings simulatorSettings)
{
    double err = simulatorSettings.ErrorFrequency;

    // Set the length using the appropriate random number distribution type
    long subLength = simulatorSettings.SequenceLength;
    switch (simulatorSettings.DistributionType)
    {
        case (int)Distribution.Uniform:
            // Random.Next(max) never returns max itself, so the "+ 1" is needed to make the
            // offset cover the symmetric range [-LengthVariation, +LengthVariation].
            subLength += _seqRandom.Next(simulatorSettings.LengthVariation * 2 + 1) - simulatorSettings.LengthVariation;
            break;

        case (int)Distribution.Normal:
            // Presumably (mean, standard deviation) - confirm against GetNormalRandom.
            subLength = (long)Math.Floor(Bio.Util.Helper.GetNormalRandom(simulatorSettings.SequenceLength, simulatorSettings.LengthVariation));
            break;
    }

    // Quick sanity checks on the length of the subsequence
    if (subLength <= 0)
    {
        subLength = 1;
    }

    if (subLength > sequenceToSplit.Count)
    {
        subLength = sequenceToSplit.Count;
    }

    // Set the start position. NextDouble() is always below 1.0, so scaling by
    // (lastStart + 1) makes every start in [0, lastStart] reachable; scaling by
    // lastStart alone could never select the final window. Math.Min guards against
    // floating-point rounding pushing the product up to lastStart + 1.
    long lastStart = sequenceToSplit.Count - subLength;
    long startPosition = Math.Min(lastStart, (long)Math.Floor(_seqRandom.NextDouble() * (lastStart + 1)));

    byte[] sequenceBytes = new byte[subLength];
    IAlphabet resultSequenceAlphabet = sequenceToSplit.Alphabet;

    // Switch to the mapped ambiguous alphabet when ambiguities are allowed and the
    // source uses one of the three base alphabets that are looked up in the map.
    if (simulatorSettings.AllowAmbiguities &&
        (sequenceToSplit.Alphabet == DnaAlphabet.Instance ||
         sequenceToSplit.Alphabet == RnaAlphabet.Instance ||
         sequenceToSplit.Alphabet == ProteinAlphabet.Instance))
    {
        resultSequenceAlphabet = Alphabets.AmbiguousAlphabetMap[sequenceToSplit.Alphabet];
    }

    // Candidate symbols used when an error is injected.
    List<byte> errorSource = resultSequenceAlphabet.GetValidSymbols().ToList();

    // remove gap and termination symbol
    HashSet<byte> gaps, terminations;
    sequenceToSplit.Alphabet.TryGetGapSymbols(out gaps);
    sequenceToSplit.Alphabet.TryGetTerminationSymbols(out terminations);

    if (gaps != null)
    {
        errorSource.RemoveAll(a => gaps.Contains(a));
    }

    if (terminations != null)
    {
        errorSource.RemoveAll(a => terminations.Contains(a));
    }

    for (long i = 0; i < subLength; i++)
    {
        // Apply Errors if applicable. Random.Next's upper bound is exclusive: pass Count
        // (not Count - 1) so every candidate, including the last one, can be selected.
        sequenceBytes[i] = _seqRandom.NextDouble() < err
            ? errorSource[_seqRandom.Next(errorSource.Count)]
            : sequenceToSplit[startPosition + i];
    }

    // The buffer is not touched after this point, so it is passed directly
    // instead of through an extra ToArray() copy.
    ISequence generatedSequence = new Sequence(resultSequenceAlphabet, sequenceBytes);
    generatedSequence.ID = sequenceToSplit.ID + " (Split " + (index + 1) + ", " + generatedSequence.Count + "bp)";

    // Reverse Sequence if applicable
    if (simulatorSettings.ReverseHalf && _seqRandom.NextDouble() < 0.5f)
    {
        return new DerivedSequence(generatedSequence, true, true);
    }

    return generatedSequence;
}