/// <summary>
/// Builds a Sequence from the textual form of its symbols, using the supplied alphabet.
/// </summary>
/// <param name="alphabet">Alphabet this sequence conforms to.</param>
/// <param name="sequence">The symbols as text; they are converted to bytes with UTF-8.</param>
/// <param name="validate">
/// When true the encoded symbols are checked against <paramref name="alphabet"/>;
/// when false the check is skipped.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="sequence"/> or <paramref name="alphabet"/> is null.
/// </exception>
public Sequence(IAlphabet alphabet, string sequence, bool validate)
{
    // Guard clauses: the sequence text is checked before the alphabet.
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    this.Alphabet = alphabet;
    this.ID = string.Empty;

    byte[] symbols = Encoding.UTF8.GetBytes(sequence);

    // Reject data the alphabet does not accept, but only when the caller asked for it.
    if (validate && !alphabet.ValidateSequence(symbols, 0, symbols.GetLongLength()))
    {
        throw Helper.GenerateAlphabetCheckFailureException(alphabet, symbols);
    }

    this._sequenceData = symbols;
    this.Count = symbols.GetLongLength();
}
/// <summary>
/// Initializes a new instance of the NGramIndexM1 class from already-built index data.
/// </summary>
/// <param name="dictionary">The words to be indexed; forwarded to the base class.</param>
/// <param name="alphabet">
/// The base alphabet. Words containing characters that do not exist in the alphabet are ignored.
/// </param>
/// <param name="ngramMap">The index table.</param>
/// <param name="n">The length of the n-gram.</param>
/// <param name="maxLength">The length of the longest word in the dictionary.</param>
public NGramIndexM1(string[] dictionary, IAlphabet alphabet, int[][][] ngramMap, int n, int maxLength)
    : base(dictionary)
{
    // Plain field capture; no validation or copying is performed here.
    _n = n;
    _maxLength = maxLength;
    _alphabet = alphabet;
    _ngramMap = ngramMap;
}
/// <summary>
/// Initializes a new instance of the EnigmaMachine class from its components.
/// </summary>
/// <param name="alphabet">Alphabet stored for later use by the machine.</param>
/// <param name="leftRotor">Rotor stored as the left rotor.</param>
/// <param name="centerRotor">Rotor stored as the center rotor.</param>
/// <param name="righRotor">Rotor stored as the right rotor.</param>
/// <param name="reflector">Reflector stored for later use by the machine.</param>
public EnigmaMachine(IAlphabet alphabet, IRotor leftRotor, IRotor centerRotor, IRotor righRotor, IReflector reflector)
{
    _alphabet = alphabet;
    _reflector = reflector;

    // Rotors, left to right.
    _leftRotor = leftRotor;
    _centerRotor = centerRotor;
    _rightRotor = righRotor;
}
/// <summary>
/// Initializes a new instance of the AssemblyInputDialog class.
/// </summary>
/// <param name="IsAlignment">Flags if the operation is alignment (true) or assembly (false).</param>
/// <param name="sequenceAlphabet">Alphabet of the selected sequences.</param>
/// <param name="selectedAligner">
/// Aligner to pre-select. When null, the aligner named "Smith-Waterman" is looked up instead;
/// when the resulting aligner is not in the drop down, the first entry is selected.
/// </param>
public AssemblyInputDialog(bool IsAlignment, IAlphabet sequenceAlphabet, ISequenceAligner selectedAligner = null)
{
    this.isAlignment = IsAlignment;
    this.sequenceAlphabet = sequenceAlphabet;

    InitializeComponent();

    // Alignment mode hides the threshold and aligner panels and swaps the heading text.
    if (isAlignment)
    {
        thresholdsPanel.Visibility = Visibility.Hidden;
        alignerPanel.Visibility = Visibility.Collapsed;
        headingBlock.Text = Resources["AssemblyInputDialog_AlignInputParameters"].ToString();
    }

    // Fill the drop down in name order; assembly only offers pairwise aligners.
    foreach (ISequenceAligner candidate in SequenceAligners.All.OrderBy(sa => sa.Name))
    {
        if (IsAlignment || candidate is IPairwiseSequenceAligner)
        {
            alignerDropDown.Items.Add(candidate.Name);
        }
    }

    // Fall back to Smith-Waterman when the caller did not choose an aligner.
    if (selectedAligner == null)
    {
        selectedAligner = SequenceAligners.All.FirstOrDefault(
            sa => string.Equals(sa.Name, "Smith-Waterman", StringComparison.OrdinalIgnoreCase));
    }

    // Use the requested aligner only if it is actually listed; otherwise take the first entry.
    bool isListed = selectedAligner != null && alignerDropDown.Items.Contains(selectedAligner.Name);
    if (isListed)
    {
        alignerDropDown.Text = selectedAligner.Name;
    }
    else
    {
        alignerDropDown.SelectedIndex = 0;
    }

    // Load the parameters of the chosen aligner before any handler is attached.
    LoadAlignmentArguments(alignerDropDown.Text);

    this.btnSubmit.Click += this.OnSubmitButtonClicked;
    this.btnCancel.Click += this.OnCancelClicked;
    this.alignerDropDown.SelectionChanged += this.OnAlignerChanged;

    this.btnSubmit.Focus();
}
/// <summary>
/// Initializes a new instance of the NGramSearcherM1 class.
/// </summary>
/// <param name="index">
/// Index to search; it is forwarded to the base class and its dictionary, alphabet,
/// n-gram map, n-gram length and maximum word length are copied into this searcher.
/// </param>
/// <param name="metric">Metric stored for use by the searcher.</param>
/// <param name="maxDistance">Maximum distance stored for use by the searcher.</param>
/// <param name="prefix">Prefix flag stored for use by the searcher.</param>
public NGramSearcherM1(NGramIndexM1 index, Metric metric, int maxDistance, bool prefix)
    : base(index)
{
    // Search settings supplied by the caller.
    _metric = metric;
    _maxDistance = maxDistance;
    _prefix = prefix;

    // Snapshot of the index's data.
    _dictionary = index.Dictionary;
    _alphabet = index.Alphabet;
    _ngramMap = index.NGramMap;
    _n = index.N;
    _maxLength = index.MaxLength;
}
/// <summary>
/// Parses a list of GFF sequences using a StreamReader.
/// </summary>
/// <remarks>
/// Headers are parsed first, because the GFF spec requires all of them at the top of the
/// file; the features follow, and the collected metadata is then copied by CopyMetadata().
/// Alphabets.DNA is used when no Alphabet has been set.
/// </remarks>
/// <returns>The list of parsed ISequence objects.</returns>
/// <exception cref="ArgumentNullException">Filename is null or empty.</exception>
/// <exception cref="InvalidOperationException">
/// The stream is already at its end once the headers have been parsed.
/// </exception>
public IEnumerable<ISequence> Parse()
{
    if (string.IsNullOrEmpty(this.Filename))
    {
        // Report the member that is missing. The previous code passed the (null or empty)
        // value itself as the parameter name, which produced an uninformative exception.
        throw new ArgumentNullException("Filename");
    }

    this.sequences = new List<Tuple<ISequence, List<byte>>>();
    sequencesInHeader = new List<Tuple<ISequence, List<byte>>>();

    IAlphabet alphabet = Alphabet ?? Alphabets.DNA;
    commonSeq = new Sequence(alphabet, String.Empty);

    // The GFF spec says that all headers need to be at the top of the file.
    string line = ParseHeaders();

    // A feature file with no features? May it never be.
    // NOTE(review): if ParseHeaders() returns the only feature line of the file, the stream
    // is already at its end here and this check rejects the file — confirm against ParseHeaders.
    if (this.streamReader.EndOfStream)
    {
        string message = Properties.Resource.GFFNoFeatures;
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    // ParseFeatures hands back the next line to process, or null when it is done.
    while (line != null)
    {
        line = ParseFeatures(line);
    }

    CopyMetadata();

    // Materialize each (sequence, bytes) pair as a concrete Sequence carrying the
    // original ID and metadata.
    List<Sequence> resultSequences = new List<Sequence>(this.sequences.Count);
    foreach (var curSeq in this.sequences)
    {
        resultSequences.Add(
            new Sequence(curSeq.Item1.Alphabet, curSeq.Item2.ToArray())
            {
                ID = curSeq.Item1.ID,
                Metadata = curSeq.Item1.Metadata
            });
    }

    // The list was built locally, so it can be returned without another copy.
    return resultSequences;
}
/// <summary>
/// Builds a sparse sequence from the alphabet items found at even positions, replaces
/// the range starting at position 8 with the symbols found at odd positions, and
/// validates the resulting count and contents.
/// </summary>
/// <param name="alphabet">alphabet instance.</param>
private static void ValidateSparseSequenceReplaceRange(IAlphabet alphabet)
{
    const int insertPosition = 8;
    const string completionMessage =
        "SparseSequenceP1: Validation of RelpaceRange() method with sequence item is completed";

    // Snapshot the alphabet into an indexable array.
    ISequenceItem[] allItems = new ISequenceItem[alphabet.Count];
    int next = 0;
    foreach (ISequenceItem item in alphabet)
    {
        allItems[next++] = item;
    }

    // Even positions seed the sparse sequence; odd positions form the replacement,
    // both as a symbol string and as the list of items expected afterwards.
    List<ISequenceItem> evenItems = new List<ISequenceItem>();
    List<ISequenceItem> oddItems = new List<ISequenceItem>();
    string replacement = string.Empty;
    for (int position = 0; position < alphabet.Count; position++)
    {
        if (position % 2 == 0)
        {
            evenItems.Add(allItems[position]);
        }
        else
        {
            replacement += allItems[position].Symbol.ToString((IFormatProvider)null);
            oddItems.Add(allItems[position]);
        }
    }

    // Create sparse sequence
    SparseSequence sparseSequence = new SparseSequence(alphabet, insertPosition, evenItems);
    Assert.AreEqual(evenItems.Count + insertPosition, sparseSequence.Count);

    // Replace the range and validate that the sparse sequence items were replaced.
    sparseSequence.IsReadOnly = false;
    sparseSequence.ReplaceRange(insertPosition, replacement);
    Assert.AreEqual(oddItems.Count + insertPosition, sparseSequence.Count);

    foreach (ISequenceItem expected in oddItems)
    {
        Assert.IsTrue(sparseSequence.Contains(expected));
    }

    Console.WriteLine(completionMessage);
    ApplicationLog.WriteLine(completionMessage);
}
/// <summary>
/// Generate IProfiles from a set of aligned sequences.
/// </summary>
/// <remarks>
/// For every column the occurrences of each item are counted (an ambiguous item
/// increments every entry listed for it in AmbiguousCharactersMap) and the column
/// is then passed to MsaUtils.Normalize.
/// </remarks>
/// <param name="sequences">a set of aligned sequences</param>
/// <returns>Profiles with one row per column of the alignment.</returns>
/// <exception cref="ArgumentNullException"><paramref name="sequences"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The collection is empty, the sequences differ in length, or they use different alphabets.
/// </exception>
public static IProfiles GenerateProfiles(ICollection<ISequence> sequences)
{
    if (sequences == null)
    {
        throw new ArgumentNullException("sequences");
    }

    if (sequences.Count == 0)
    {
        // Previously the unchecked MoveNext() left Current invalid and the method
        // failed with an unrelated exception.
        throw new ArgumentException("Input sequences are empty", "sequences");
    }

    // The first sequence fixes the expected length and alphabet for all others.
    // foreach is used instead of IEnumerator.Reset(), which many enumerators do
    // not support, and it also disposes the enumerator.
    bool haveFirst = false;
    int sequenceLength = 0;
    IAlphabet alphabet = null;
    foreach (ISequence sequence in sequences)
    {
        if (!haveFirst)
        {
            sequenceLength = sequence.Count;
            alphabet = sequence.Alphabet;
            haveFirst = true;
            continue;
        }

        if (sequence.Count != sequenceLength)
        {
            throw new ArgumentException("Input sequences are not aligned");
        }

        if (sequence.Alphabet != alphabet)
        {
            throw new ArgumentException("Input sequences use different alphabets");
        }
    }

    int colSize = ItemSet.Count;
    IProfiles profiles = new Profiles(sequenceLength, colSize);

    for (int i = 0; i < sequenceLength; ++i)
    {
        foreach (ISequence sequence in sequences)
        {
            var item = sequence[i];
            if (item.IsAmbiguous)
            {
                // An ambiguous item counts once for every item it may stand for.
                var candidates = AmbiguousCharactersMap[item];
                for (int b = 0; b < candidates.Count; ++b)
                {
                    ++(profiles[i][ItemSet[candidates[b]]]);
                }
            }
            else
            {
                ++(profiles[i][ItemSet[item]]);
            }
        }

        MsaUtils.Normalize(profiles[i]);
    }

    profiles.ColumnSize = colSize;
    profiles.RowSize = sequenceLength;
    return profiles;
}
/// <summary>
/// Initializes a new instance of the QualitativeSequence class from raw symbol bytes
/// and their quality scores. Both arrays are copied, so later changes to the
/// caller's arrays do not affect this instance.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="fastQFormatType">FastQ format type.</param>
/// <param name="sequence">An array of bytes representing the symbols.</param>
/// <param name="qualityScores">An array of bytes representing the quality scores.</param>
/// <param name="validate">
/// When true, the symbols are validated against the alphabet and the quality scores
/// against the format type; when false, both checks are skipped.
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="alphabet"/>, <paramref name="sequence"/> or <paramref name="qualityScores"/> is null.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Validation was requested and the symbols or the quality scores were rejected.
/// </exception>
public QualitativeSequence(IAlphabet alphabet, FastQFormatType fastQFormatType, byte[] sequence, byte[] qualityScores, bool validate)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (qualityScores == null)
    {
        throw new ArgumentNullException("qualityScores");
    }

    this.Alphabet = alphabet;
    this.ID = string.Empty;
    this.FormatType = fastQFormatType;

    if (validate)
    {
        // Symbols first, quality scores second.
        bool symbolsValid = this.Alphabet.ValidateSequence(sequence, 0, sequence.LongLength());
        if (!symbolsValid)
        {
            throw new ArgumentOutOfRangeException("sequence");
        }

        bool scoresValid = ValidateQualScore(qualityScores, this.FormatType);
        if (!scoresValid)
        {
            throw new ArgumentOutOfRangeException("qualityScores");
        }
    }

    // Clone() yields a new array of the same length and content, which makes the
    // platform-specific length handling of a manual copy unnecessary.
    this.sequenceData = (byte[])sequence.Clone();
    this.qualityScores = (byte[])qualityScores.Clone();

    this.Count = this.sequenceData.LongLength();
}
/// <summary>
/// Creates a sparse sequence holding every sequence item of the alphabet,
/// in the alphabet's enumeration order.
/// </summary>
/// <param name="alphabet">Alphabet whose items are inserted.</param>
/// <param name="insertPosition">Position passed to the SparseSequence constructor.</param>
/// <returns>The newly created sparse sequence.</returns>
private SparseSequence CreateSparseSequence(IAlphabet alphabet, int insertPosition)
{
    // Collect the alphabet's items into a list.
    List<ISequenceItem> alphabetItems = new List<ISequenceItem>();
    foreach (ISequenceItem alphabetItem in alphabet)
    {
        alphabetItems.Add(alphabetItem);
    }

    // Hand the items to the sparse sequence in one go.
    return new SparseSequence(alphabet, insertPosition, alphabetItems);
}
/// <summary>
/// Parses a single SAM alignment line into a SAMAlignedSequence.
/// </summary>
/// <remarks>
/// The first eleven tab-separated tokens are the mandatory fields; every further
/// token is parsed as an optional field of the form tag:type:value.
/// </remarks>
/// <param name="bioText">sequence alignment text.</param>
/// <param name="alphabet">Alphabet of the sequences.</param>
/// <param name="referenceSequences">
/// Reference sequences; the one whose ID matches RNAME (ignoring case) is passed on
/// when the sequence and quality are parsed. May be null or empty.
/// </param>
/// <returns>The aligned sequence built from the line.</returns>
/// <exception cref="FormatException">
/// The line has fewer than eleven fields, or an optional field does not match OptionalFieldRegex.
/// </exception>
private static SAMAlignedSequence ParseSequence(string bioText, IAlphabet alphabet, IList<ISequence> referenceSequences)
{
    // Tokens 0-10 are mandatory; optional fields start at index 11.
    const int optionalTokenStartingIndex = 11;

    // An optional field has exactly three parts: tag, value type and value.
    const int optionalFieldPartCount = 3;

    string[] tokens = bioText.Split(tabDelim, StringSplitOptions.RemoveEmptyEntries);
    if (tokens.Length < optionalTokenStartingIndex)
    {
        // Fail with a parse error instead of an IndexOutOfRangeException further down.
        throw new FormatException(
            string.Format(
                CultureInfo.CurrentCulture,
                "Invalid SAM alignment line: expected at least {0} tab-separated fields but found {1}.",
                optionalTokenStartingIndex,
                tokens.Length));
    }

    SAMAlignedSequence alignedSeq = new SAMAlignedSequence();
    alignedSeq.QName = tokens[0];
    alignedSeq.Flag = SAMAlignedSequenceHeader.GetFlag(tokens[1]);
    alignedSeq.RName = tokens[2];
    alignedSeq.Pos = int.Parse(tokens[3], CultureInfo.InvariantCulture);
    alignedSeq.MapQ = int.Parse(tokens[4], CultureInfo.InvariantCulture);
    alignedSeq.CIGAR = tokens[5];

    // "=" is shorthand for "same as RNAME".
    alignedSeq.MRNM = tokens[6].Equals("=") ? alignedSeq.RName : tokens[6];
    alignedSeq.MPos = int.Parse(tokens[7], CultureInfo.InvariantCulture);
    alignedSeq.ISize = int.Parse(tokens[8], CultureInfo.InvariantCulture);

    ISequence refSeq = null;
    if (referenceSequences != null && referenceSequences.Count > 0)
    {
        refSeq = referenceSequences.FirstOrDefault(
            R => string.Compare(R.ID, alignedSeq.RName, StringComparison.OrdinalIgnoreCase) == 0);
    }

    ParseQualityNSequence(alignedSeq, alphabet, tokens[9], tokens[10], refSeq);

    for (int i = optionalTokenStartingIndex; i < tokens.Length; i++)
    {
        if (!Helper.IsValidRegexValue(OptionalFieldRegex, tokens[i]))
        {
            string message = string.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.InvalidOptionalField,
                tokens[i]);
            throw new FormatException(message);
        }

        // Limit the split to three parts so that a value which itself contains the
        // delimiter is kept whole instead of being cut at its first delimiter.
        string[] opttokens = tokens[i].Split(colonDelim, optionalFieldPartCount, StringSplitOptions.RemoveEmptyEntries);

        SAMOptionalField optField = new SAMOptionalField();
        optField.Tag = opttokens[0];
        optField.VType = opttokens[1];
        optField.Value = opttokens[2];
        alignedSeq.OptionalFields.Add(optField);
    }

    return alignedSeq;
}
/// <summary>
/// Construct a calculator with selected distance function.
///
/// A distance function is assigned to the class and it is
/// read-only for a given set of input sequences.
/// </summary>
/// <param name="kmerLength">positive integer kmer length</param>
/// <param name="alphabetType">molecule type: DNA, RNA or Protein</param>
/// <param name="DistanceFunctionName">DistanceFunctionTypes member</param>
/// <exception cref="ArgumentException">
/// <paramref name="kmerLength"/> is not positive or is so large that the number of possible
/// kmers does not fit in an int, <paramref name="alphabetType"/> is not a DNA, RNA or
/// Protein alphabet, or <paramref name="DistanceFunctionName"/> is not a supported member.
/// </exception>
public KmerDistanceScoreCalculator(int kmerLength, IAlphabet alphabetType, DistanceFunctionTypes DistanceFunctionName)
{
    // Base of the kmer count per molecule type.
    // NOTE(review): presumably the number of symbols distinguished per alphabet — confirm.
    const int nucleotideSymbolCount = 15;
    const int proteinSymbolCount = 25;

    if (kmerLength <= 0)
    {
        throw new ArgumentException("Kmer length needs to be positive");
    }

    _kmerLength = kmerLength;

    int symbolCount;
    if (alphabetType is DnaAlphabet || alphabetType is RnaAlphabet)
    {
        symbolCount = nucleotideSymbolCount;
    }
    else if (alphabetType is ProteinAlphabet)
    {
        symbolCount = proteinSymbolCount;
    }
    else
    {
        // ArgumentException derives from Exception, so existing catch blocks still
        // match, while the failure is now attributable to the argument.
        throw new ArgumentException("Invalid molecular type", "alphabetType");
    }

    // Casting a double beyond the int range yields an unspecified value, so reject
    // kmer lengths whose kmer count cannot be represented.
    double possibleKmers = Math.Pow(symbolCount, _kmerLength);
    if (possibleKmers > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(
            "kmerLength",
            "Kmer length is too large: the number of possible kmers exceeds the supported range");
    }

    _numberOfPossibleKmers = (int)possibleKmers;

    switch (DistanceFunctionName)
    {
        case DistanceFunctionTypes.EuclideanDistance:
            _distanceFunction = new DistanceFunctionSelector(EuclideanDistance);
            break;
        case DistanceFunctionTypes.CoVariance:
            _distanceFunction = new DistanceFunctionSelector(CoVariance);
            break;
        case DistanceFunctionTypes.PearsonCorrelation:
            _distanceFunction = new DistanceFunctionSelector(PearsonCorrelation);
            break;
        case DistanceFunctionTypes.ModifiedMUSCLE:
            _distanceFunction = new DistanceFunctionSelector(ModifiedMUSCLE);
            break;
        default:
            throw new ArgumentException("Similarity Function Name is not in the list...");
    }
}
/// <summary>
/// The execution method for the activity: looks up the alphabet by name
/// (case-insensitively, falling back to DefaultAlphabet when AlphabetName is null)
/// and creates a sequence from the SequenceData argument.
/// </summary>
/// <param name="context">Activity context used to read the SequenceData argument.</param>
/// <returns>A new Sequence whose ID is this activity's ID.</returns>
/// <exception cref="ArgumentException">No known alphabet has the requested name.</exception>
protected override ISequence Execute(CodeActivityContext context)
{
    string requestedName = (AlphabetName ?? DefaultAlphabet).ToLowerInvariant();

    // The first alphabet whose lower-cased name matches is used.
    IAlphabet match = null;
    foreach (IAlphabet candidate in Alphabets.All)
    {
        if (candidate.Name.ToLowerInvariant() == requestedName)
        {
            match = candidate;
            break;
        }
    }

    if (match == null)
    {
        throw new ArgumentException("Unknown alphabet name");
    }

    // Generate the sequence
    Sequence result = new Sequence(match, SequenceData.Get(context)) { ID = this.ID };
    return result;
}
/// <summary>
/// Maps an alphabet to its generic type name: "DNA", "RNA" or "Protein".
/// </summary>
/// <param name="alphabet">
/// Alphabet to classify; it is compared against Alphabets.DNA, Alphabets.RNA and Alphabets.Protein.
/// </param>
/// <returns>The type name, or null when the alphabet is none of the three.</returns>
private string GetGenericTypeString(IAlphabet alphabet)
{
    return alphabet == Alphabets.DNA ? "DNA"
         : alphabet == Alphabets.RNA ? "RNA"
         : alphabet == Alphabets.Protein ? "Protein"
         : null;
}
/// <summary>
/// Creates the alphabet, the key collection, the encrypt/decrypt commands and the
/// affine cipher provider, selecting the middle entry of the available keys when
/// there is at least one.
/// </summary>
private void InitializeCryptoComponents()
{
    _alphabet = new CharactersAlphabet();
    _avaibleKeys = new ObservableCollection<int>();

    EncryptCommand = new RelayCommand(EncryptMessage, CanEncrypt);
    DecryptCommand = new RelayCommand(DecryptMessage, CanDecrypt);

    GetAvaibleKeys();

    // Pre-select the key in the middle of the list (integer division).
    bool hasKeys = AvaibleKeys != null && AvaibleKeys.Count > 0;
    if (hasKeys)
    {
        SelectedKey = AvaibleKeys[_avaibleKeys.Count / 2];
    }

    // The provider receives null for the key when _key is not an AffineKey.
    _provider = new AffineCipher(_alphabet, _key as AffineKey);
}
/// <summary>
/// Adds a consensus to the alignment result. The consensus symbol for each position
/// is obtained from ConsensusResolver, given the two aligned symbols at that position.
/// </summary>
/// <param name="alignment">
/// Alignment to which to add the consensus. This is the result returned by the main Align
/// or AlignSimple method, which contains the aligned sequences but not yet a consensus sequence.
/// </param>
private void AddSimpleConsensusToResult(PairwiseAlignedSequence alignment)
{
    ISequence first = alignment.FirstSequence;
    ISequence second = alignment.SecondSequence;

    // One consensus symbol per position of the first sequence.
    byte[] consensusSymbols = new byte[first.Count];
    for (int position = 0; position < first.Count; position++)
    {
        byte[] column = new byte[] { first[position], second[position] };
        consensusSymbols[position] = ConsensusResolver.GetConsensus(column);
    }

    // Detect the alphabet of the consensus, passing the first sequence's alphabet along.
    IAlphabet detectedAlphabet = Alphabets.AutoDetectAlphabet(
        consensusSymbols,
        0,
        consensusSymbols.GetLongLength(),
        first.Alphabet);

    // The consensus is created without validation.
    alignment.Consensus = new Sequence(detectedAlphabet, consensusSymbols, false);
}
/// <summary>
/// Validates DerivedSequence.ToString() for a short sequence, a long sequence and a
/// sequence whose data and alphabet name are read from the xml configuration.
/// </summary>
public void ValidateDerivedSequenceToString()
{
    // --- Short and long DNA sequences, derived with the flags (false, true). ---
    ISequence smallSource = new Sequence(Alphabets.DNA, "ATCG");
    string largeSourceText = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ToStringNodeName, Constants.seqLargeStringNode);
    ISequence largeSource = new Sequence(Alphabets.DNA, largeSourceText);

    ISequence smallDerived = new DerivedSequence(smallSource, false, true);
    ISequence largeDerived = new DerivedSequence(largeSource, false, true);

    string smallActual = smallDerived.ToString();
    string largeActual = largeDerived.ToString();

    // The expected text for the long sequence is a format string taking the number
    // of symbols beyond the amount shown by ToString().
    string largeExpectedFormat = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ToStringNodeName, Constants.seqLargeExpectedNode);
    string largeExpected = string.Format(
        CultureInfo.CurrentCulture,
        largeExpectedFormat,
        (largeSource.Count - Helper.AlphabetsToShowInToString));

    Assert.AreEqual("TAGC", smallActual);
    Assert.AreEqual(largeExpected, largeActual);

    // --- Sequence read from the xml configuration, derived with the flags (false, false). ---
    string expectedSequence = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ProteinDerivedSequenceNode, Constants.ExpectedSequence);
    string alphabetName = this.utilityObj.xmlUtil.GetTextValue(
        Constants.ProteinDerivedSequenceNode, Constants.AlphabetNameNode);
    IAlphabet alphabet = Utility.GetAlphabet(alphabetName);

    ISequence source = new Sequence(alphabet, expectedSequence);
    var derived = new DerivedSequence(source, false, false);
    string derivedText = derived.ToString();

    if (derivedText.Length <= Helper.AlphabetsToShowInToString)
    {
        // Short enough to be rendered in full.
        Assert.AreEqual(expectedSequence, derivedText);
    }
    else
    {
        // Truncated output: the shown prefix must come from the full sequence and
        // the remainder marker must be present.
        string shownPrefix = derivedText.Substring(0, Helper.AlphabetsToShowInToString);
        Assert.IsTrue(expectedSequence.Contains(shownPrefix));
        Assert.IsTrue(derivedText.Contains("... +["));
    }
}
/// <summary>
/// Validates the NUCmer AlignSimple method for the test case described by an xml node:
/// aligns the reference sequences followed by the search sequences and compares every
/// aligned first/second sequence with the comma-separated expected sequences.
/// </summary>
/// <param name="nodeName">Node name to be read from xml</param>
private void ValidateNUCmerAlignSimpleGeneralTestCases(string nodeName)
{
    // Reference and search sequences plus their alphabet come from the configuration file.
    string[] referenceTexts = this.utilityObj.xmlUtil.GetTextValues(nodeName, Constants.ReferenceSequencesNode);
    string[] searchTexts = this.utilityObj.xmlUtil.GetTextValues(nodeName, Constants.SearchSequencesNode);
    IAlphabet seqAlphabet = Utility.GetAlphabet(
        this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.AlphabetNameNode));

    // One list for the aligner: references first, then search sequences.
    IList<ISequence> seqList = new List<ISequence>();
    foreach (string text in referenceTexts)
    {
        seqList.Add(new Sequence(seqAlphabet, Encoding.ASCII.GetBytes(text)));
    }

    foreach (string text in searchTexts)
    {
        seqList.Add(new Sequence(seqAlphabet, Encoding.ASCII.GetBytes(text)));
    }

    // Every aligner setting below is taken from the same xml node (MUMAlignLengthNode).
    string mumLength = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.MUMAlignLengthNode);
    int mumLengthValue = int.Parse(mumLength, null);

    var nucmerObj = new NucmerPairwiseAligner();
    nucmerObj.MaximumSeparation = mumLengthValue;
    nucmerObj.MinimumScore = mumLengthValue;
    nucmerObj.SeparationFactor = mumLengthValue;
    nucmerObj.BreakLength = mumLengthValue;
    nucmerObj.LengthOfMUM = long.Parse(mumLength, null);

    IList<ISequenceAlignment> alignSimple = nucmerObj.AlignSimple(seqList);

    // Expected sequences are stored comma separated, alternating first/second sequence.
    string expectedSequences = this.utilityObj.xmlUtil.GetTextValue(nodeName, Constants.ExpectedSequencesNode);
    string[] expSeqArray = expectedSequences.Split(',');

    int expectedIndex = 0;
    foreach (ISequenceAlignment alignment in alignSimple)
    {
        foreach (PairwiseAlignedSequence alignedSeq in (IPairwiseSequenceAlignment)alignment)
        {
            Assert.AreEqual(expSeqArray[expectedIndex], alignedSeq.FirstSequence.ConvertToString());
            expectedIndex++;
            Assert.AreEqual(expSeqArray[expectedIndex], alignedSeq.SecondSequence.ConvertToString());
            expectedIndex++;
        }
    }

    ApplicationLog.WriteLine("NUCmer P2 : Successfully validated all the aligned sequences.");
}
/// <summary>
/// Initializes a new instance of the SubstringSearchBoyerMoore class and builds the
/// table holding, for every alphabet index, the rightmost position at which that
/// character occurs in the pattern (-1 when it does not occur).
/// </summary>
/// <param name="pat">The pattern to search for.</param>
/// <param name="alphabet">
/// Alphabet providing the table size (R) and the character-to-index mapping (ToIndex).
/// </param>
public SubstringSearchBoyerMoore(string pat, IAlphabet alphabet)
{
    this.pat = pat;
    this.alphabet = alphabet;

    int radix = alphabet.R;
    right = new int[radix];

    // -1 marks characters that do not occur in the pattern.
    for (int index = 0; index < right.Length; index++)
    {
        right[index] = -1;
    }

    // Later occurrences overwrite earlier ones, leaving the rightmost position.
    int position = 0;
    foreach (char symbol in pat)
    {
        right[alphabet.ToIndex(symbol)] = position;
        position++;
    }
}
/// <summary>
/// Parses a single GFF text from a reader into a sequence.
/// </summary>
/// <param name="mbfReader">A reader for a biological sequence text.</param>
/// <param name="isReadOnly">
/// Flag to indicate whether the resulting sequence should be in readonly mode or not;
/// it is passed on to CopyMetadata.
/// </param>
/// <returns>The first parsed sequence.</returns>
/// <exception cref="ArgumentNullException"><paramref name="mbfReader"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// More than one sequence was found while parsing in single-sequence mode.
/// </exception>
protected override ISequence ParseOneWithSpecificFormat(MBFTextReader mbfReader, bool isReadOnly)
{
    if (mbfReader == null)
    {
        throw new ArgumentNullException("mbfReader");
    }

    // Reset the parser state for a single-sequence parse.
    _isSingleSeqGff = true;
    _sequences = new List<Sequence>();
    _sequencesInHeader = new List<Sequence>();

    // DNA is the fallback alphabet; the encoding is only passed when one is set.
    IAlphabet alphabet = Alphabet ?? Alphabets.DNA;
    _commonSeq = Encoding == null
        ? new Sequence(alphabet)
        : new Sequence(alphabet, Encoding, string.Empty);

    // The GFF spec says that all headers need to be at the top of the file.
    ParseHeaders(mbfReader);
    ParseFeatures(mbfReader);
    CopyMetadata(isReadOnly);

    // A single-sequence parse must not yield a second sequence.
    if (_isSingleSeqGff && _sequences.Count > 1)
    {
        string message = String.Format(
            CultureInfo.CurrentCulture,
            Properties.Resource.UnexpectedSecondSequenceName,
            mbfReader.LocationString);
        Trace.Report(message);
        throw new InvalidOperationException(message);
    }

    return _sequences[0];
}
/// <summary>
/// Creates a SparseSequence with no sequence data.
///
/// The Count property of the created instance is set to the value of the size parameter.
///
/// For working with sequences that never have sequence data, but are
/// only used for metadata storage (like keeping an ID or various features
/// but no direct sequence data) consider using the VirtualSequence
/// class instead.
/// </summary>
/// <param name="alphabet">
/// The alphabet the sequence uses (e.g. Alphabets.DNA or Alphabets.RNA or Alphabets.Protein)
/// </param>
/// <param name="size">A value indicating the size of this sequence.</param>
/// <exception cref="ArgumentNullException"><paramref name="alphabet"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="size"/> is negative.</exception>
public SparseSequence(IAlphabet alphabet, int size)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    bool sizeIsNegative = size < 0;
    if (sizeIsNegative)
    {
        throw new ArgumentOutOfRangeException(
            Properties.Resource.ParameterNameSize,
            Properties.Resource.ParameterMustNonNegative);
    }

    Count = size;
    Alphabet = alphabet;

    // Statistics start out empty for the given alphabet.
    Statistics = new SequenceStatistics(alphabet);
}
/// <summary>
/// Converts a message given as lines of digits into lines of text by mapping
/// every digit to its symbol in the alphabet.
/// </summary>
/// <param name="codedDigitMessage">Message represented in digit format, one int array per line.</param>
/// <param name="alphabet">Crypto alphabet used to look up the symbol of each digit.</param>
/// <returns>One string per input line, in the same order.</returns>
public static string[] ConvertDigitsToChar(List<int[]> codedDigitMessage, IAlphabet alphabet)
{
    string[] lines = new string[codedDigitMessage.Count];

    int lineIndex = 0;
    foreach (int[] digits in codedDigitMessage)
    {
        // Translate the line symbol by symbol.
        char[] symbols = new char[digits.Length];
        for (int column = 0; column < symbols.Length; column++)
        {
            symbols[column] = alphabet.GetSymbol(digits[column]);
        }

        lines[lineIndex++] = new string(symbols);
    }

    return lines;
}
/// <summary>
/// Validates Alphabets.AutoDetectAlphabet for DNA, RNA and protein data: the first
/// four symbols of the sequence configured for each node are passed in and the name
/// of the detected alphabet is compared with the configured alphabet name.
/// </summary>
public void ValidateAutoDetectAlphabet()
{
    // Number of leading symbols handed to the detection.
    const int symbolsToInspect = 4;

    // Validating for Dna.
    string expectedDnaName = utilityObj.xmlUtil.GetTextValue(
        Constants.DnaDerivedSequenceNode, Constants.AlphabetNameNode);
    string dnaSequence = utilityObj.xmlUtil.GetTextValue(
        Constants.DnaDerivedSequenceNode, Constants.ExpectedDerivedSequence);
    byte[] dnaArray = encodingObj.GetBytes(dnaSequence);

    IAlphabet dnaAlphabet = Alphabets.AutoDetectAlphabet(dnaArray, 0, symbolsToInspect, null);

    // Expected value first, so that a failure message labels the values correctly.
    Assert.AreEqual(expectedDnaName, dnaAlphabet.Name);
    ApplicationLog.WriteLine(
        "Alphabets BVT: Validation of Auto Detect method for Dna completed successfully.");

    // Validating for Rna.
    string expectedRnaName = utilityObj.xmlUtil.GetTextValue(
        Constants.RnaDerivedSequenceNode, Constants.AlphabetNameNode);
    string rnaSequence = utilityObj.xmlUtil.GetTextValue(
        Constants.RnaDerivedSequenceNode, Constants.ExpectedDerivedSequence);
    byte[] rnaArray = encodingObj.GetBytes(rnaSequence);

    IAlphabet rnaAlphabet = Alphabets.AutoDetectAlphabet(rnaArray, 0, symbolsToInspect, null);

    Assert.AreEqual(expectedRnaName, rnaAlphabet.Name);
    ApplicationLog.WriteLine(
        "Alphabets BVT: Validation of Auto Detect method for Rna completed successfully.");

    // Validating for Protein.
    string expectedProteinName = utilityObj.xmlUtil.GetTextValue(
        Constants.ProteinDerivedSequenceNode, Constants.AlphabetNameNode);
    string proteinSequence = utilityObj.xmlUtil.GetTextValue(
        Constants.ProteinDerivedSequenceNode, Constants.ExpectedDerivedSequence);
    byte[] proteinArray = encodingObj.GetBytes(proteinSequence);

    IAlphabet proteinAlphabet = Alphabets.AutoDetectAlphabet(proteinArray, 0, symbolsToInspect, null);

    Assert.AreEqual(expectedProteinName, proteinAlphabet.Name);
    ApplicationLog.WriteLine(
        "Alphabets BVT: Validation of Auto Detect method for Protein completed successfully.");
}
/// <summary>
/// Checks TryGetAmbiguousSymbol on the ambiguous DNA and RNA alphabets:
/// {A, C} must resolve to 'M' and {U, C} must resolve to 'Y'.
/// </summary>
public void TestDnaAlphabetTryGetAmbiguousSymbols()
{
    byte ambiguousSymbol;

    // DNA: A or C.
    IAlphabet alphabet = AmbiguousDnaAlphabet.Instance;
    HashSet<byte> dnaSymbols = new HashSet<byte>() { (byte)'A', (byte)'C' };
    Assert.IsTrue(alphabet.TryGetAmbiguousSymbol(dnaSymbols, out ambiguousSymbol));
    Assert.AreEqual((byte)'M', ambiguousSymbol);

    // RNA: U or C.
    alphabet = AmbiguousRnaAlphabet.Instance;
    HashSet<byte> rnaSymbols = new HashSet<byte>() { (byte)'U', (byte)'C' };
    Assert.IsTrue(alphabet.TryGetAmbiguousSymbol(rnaSymbols, out ambiguousSymbol));
    Assert.AreEqual((byte)'Y', ambiguousSymbol);
}
/// <summary>
/// Initializes a new instance of the HillCipher class with a 2 by 2 key matrix.
/// </summary>
/// <param name="key">The key matrix; it must have exactly 2 rows and 2 columns.</param>
/// <param name="alphabet">The alphabet; its Size must be relatively prime to the key's determinant.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="key"/> or <paramref name="alphabet"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The key is not 2 by 2, or the greatest common divisor of the key's determinant
/// and the alphabet size is not 1.
/// </exception>
public HillCipher(int[,] key, IAlphabet alphabet)
{
    if (key == null)
    {
        // (paramName, message) overload: the text used to be passed as the parameter name.
        throw new ArgumentNullException(nameof(key), "Key provided is null.");
    }

    if (key.GetLength(0) != 2 || key.GetLength(1) != 2)
    {
        // Interpolated so that the actual dimensions appear in the message.
        throw new ArgumentException(
            $"Provided key dimensions ({key.GetLength(0)} by {key.GetLength(1)}) are not applicable (Should be 2 by 2).",
            nameof(key));
    }

    _alphabet = alphabet ?? throw new ArgumentNullException(nameof(alphabet), "Provided alphabet is null.");

    int size = alphabet.Size;
    int det = _determinant(key);

    // The key is only accepted when gcd(size, det) is 1.
    if (_gcd(size, det) != 1)
    {
        throw new ArgumentException(
            "Provided key's determinant and the alphabet size aren't relatively prime.",
            nameof(key));
    }

    _key = key;
}
/// <summary>
/// Looks up the default encoding map for a known alphabet. Several encodings may
/// exist for any one particular alphabet; to select a particular encoding,
/// consider using the GetMapToEncoding() method.
/// </summary>
/// <param name="alphabet">Alphabets.DNA, Alphabets.RNA or Alphabets.Protein.</param>
/// <returns>The default map for the given alphabet.</returns>
/// <exception cref="ArgumentNullException">
/// The alphabet is none of the three known alphabets. Note that this is thrown for
/// any unrecognized alphabet, not only for null.
/// </exception>
public static EncodingMap GetDefaultMap(IAlphabet alphabet)
{
    if (alphabet == Alphabets.DNA)
    {
        return DnaToNcbi4NA;
    }

    if (alphabet == Alphabets.RNA)
    {
        return RnaToNcbi4NA;
    }

    if (alphabet == Alphabets.Protein)
    {
        return ProteinToNcbiStdAA;
    }

    // No default map exists for this alphabet: report it and fail.
    Trace.Report(Resource.ParameterContainsNullValue);
    throw new ArgumentNullException(Resource.ParameterNameAlphabet);
}
/// <summary>
/// Validates that LastIndexOfNonGap() on the sequence configured for the RNA node
/// returns the index of its last symbol.
/// </summary>
public void ValidateSequenceLastIndexOfNonGap()
{
    // Sequence text and alphabet name come from the xml configuration.
    string sequenceText = this.utilityObj.xmlUtil.GetTextValue(
        Constants.RnaDerivedSequenceNode, Constants.ExpectedSequence);
    string alphabetName = this.utilityObj.xmlUtil.GetTextValue(
        Constants.RnaDerivedSequenceNode, Constants.AlphabetNameNode);

    Sequence sequence = new Sequence(Utility.GetAlphabet(alphabetName), sequenceText);

    long lastNonGapIndex = sequence.LastIndexOfNonGap();

    Assert.AreEqual(sequenceText.Length - 1, lastNonGapIndex);
}
/// <summary>
/// Returns the PatternConverter for the given alphabet, creating and caching it
/// on first use.
/// </summary>
/// <param name="alphabetSet">Alphabet whose converter is requested; used as the cache key.</param>
/// <returns>The cached converter for <paramref name="alphabetSet"/>.</returns>
public static IPatternConverter GetInstanace(IAlphabet alphabetSet)
{
    // Every access to the cache happens under the lock: a lookup that runs while
    // another thread adds an entry is not safe on a plain dictionary, so the
    // previous unlocked first check has been removed.
    lock (_patternConverter)
    {
        IPatternConverter patternConverter;
        if (!_patternConverter.TryGetValue(alphabetSet, out patternConverter))
        {
            patternConverter = new PatternConverter(alphabetSet);
            _patternConverter.Add(alphabetSet, patternConverter);
        }

        return patternConverter;
    }
}
/// <summary>
/// Parses the file's binary content into an abi parser context using
/// the specified alphabet.
/// </summary>
/// <param name="reader">Reader positioned on the binary content; the header is read from it first.</param>
/// <param name="alphabet">Alphabet stored in the context; Alphabets.DNA is used when null.</param>
/// <returns>The context after the version-specific parser has processed it.</returns>
public static IParserContext Parse(BinaryReader reader, IAlphabet alphabet)
{
    // The header decides which versioned parser handles the data.
    var header = new Ab1Header(reader);
    IVersionedDataParser versionedParser = DataParserFactory.GetParser(header.MajorVersion);

    var context = new ParserContext();
    context.Header = header;
    context.Reader = reader;

    // Default to the DNA alphabet
    context.Alphabet = alphabet ?? Alphabets.DNA;

    versionedParser.ParseData(context);
    return context;
}
/// <summary>
/// Constructs sequence statistics by counting the symbols of a sequence.
/// Null items are skipped.
/// </summary>
/// <param name="sequence">The sequence to construct statistics for.</param>
internal SequenceStatistics(ISequence sequence)
{
    _alphabet = sequence.Alphabet;

    // Counting with an array is way faster than using a dictionary;
    // the symbol itself is the index into the 256 counters.
    int[] countsBySymbol = new int[256];
    foreach (ISequenceItem element in sequence)
    {
        if (element != null)
        {
            countsBySymbol[element.Symbol]++;
        }
    }

    LoadFromIntArray(countsBySymbol);
}
/// <summary>
/// Creates a sparse sequence holding all items of the alphabet, removes them
/// with Clear() and validates that the sequence is empty afterwards.
/// </summary>
/// <param name="alphabet">alphabet instance.</param>
private void ValidateSparseSequenceClear(IAlphabet alphabet)
{
    const int insertPosition = 10;
    const string completionMessage = "SparseSequenceP1: Validation of Clear() method is completed";

    SparseSequence sequenceUnderTest = CreateSparseSequence(alphabet, insertPosition);
    sequenceUnderTest.IsReadOnly = false;

    // Before: the sparse sequence contains all sequence items after the insert position.
    Assert.AreEqual(alphabet.Count + insertPosition, sequenceUnderTest.Count);

    sequenceUnderTest.Clear();

    // After: nothing is left.
    Assert.AreEqual(0, sequenceUnderTest.Count);

    Console.WriteLine(completionMessage);
    ApplicationLog.WriteLine(completionMessage);
}
/// <summary>
/// Validates GetSymbolValueMap() for the selected alphabet type: two lower-case
/// symbols are looked up in the map and must yield their upper-case counterparts.
/// </summary>
/// <param name="option">Alphabet type to validate (Protein, Rna or Dna).</param>
void ValidateGetSymbolValueMap(AlphabetsTypes option)
{
    IAlphabet alphabetUnderTest = null;
    byte firstInput = 0, firstExpected = 0;
    byte secondInput = 0, secondExpected = 0;

    // Pick the alphabet and two lower-case/upper-case symbol pairs for it.
    switch (option)
    {
        case AlphabetsTypes.Protein:
            alphabetUnderTest = ProteinAlphabet.Instance;
            firstInput = (byte)'w';
            firstExpected = (byte)'W';
            secondInput = (byte)'e';
            secondExpected = (byte)'E';
            break;

        case AlphabetsTypes.Rna:
            alphabetUnderTest = RnaAlphabet.Instance;
            firstInput = (byte)'a';
            firstExpected = (byte)'A';
            secondInput = (byte)'u';
            secondExpected = (byte)'U';
            break;

        case AlphabetsTypes.Dna:
            alphabetUnderTest = DnaAlphabet.Instance;
            firstInput = (byte)'a';
            firstExpected = (byte)'A';
            secondInput = (byte)'t';
            secondExpected = (byte)'T';
            break;
    }

    // The map is indexed by the input symbol.
    byte[] symbolValueMap = alphabetUnderTest.GetSymbolValueMap();

    byte firstActual = symbolValueMap[firstInput];
    Assert.AreEqual(firstExpected, firstActual);

    byte secondActual = symbolValueMap[secondInput];
    Assert.AreEqual(secondExpected, secondActual);

    ApplicationLog.WriteLine(string.Concat(
        @"Alphabets BVT: Validation of GetSymbolValueMap method for ",
        option,
        " completed successfully."));
}
/// <summary>
/// Parses a range of sequence items starting from the specified index in the sequence.
/// </summary>
/// <param name="startIndex">The zero-based index at which to begin parsing.</param>
/// <param name="count">The number of symbols to parse.</param>
/// <param name="seqPointer">The sequence pointer of that sequence.</param>
/// <returns>
/// The parsed, read-only sequence, or null when the requested start lies at or
/// beyond the ending index recorded in <paramref name="seqPointer"/>.
/// </returns>
/// <exception cref="NotSupportedException">Thrown when no input file name is set.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <paramref name="startIndex"/> is negative or <paramref name="count"/> is not positive.
/// </exception>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="seqPointer"/> is null.</exception>
public ISequence ParseRange(int startIndex, int count, SequencePointer seqPointer)
{
    if (string.IsNullOrEmpty(_fileName))
    {
        throw new NotSupportedException(Resource.DataVirtualizationNeedsInputFile);
    }

    if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }

    if (count <= 0)
    {
        throw new ArgumentOutOfRangeException("count");
    }

    // Without this guard a null pointer only surfaces as a null dereference
    // inside the alphabet lookup lambda below.
    if (seqPointer == null)
    {
        throw new ArgumentNullException("seqPointer");
    }

    // Single() throws when no alphabet, or more than one, carries the recorded name.
    IAlphabet alphabet = Alphabets.All.Single(A => A.Name.Equals(seqPointer.AlphabetName));
    Sequence sequence = new Sequence(alphabet) { IsReadOnly = false };

    int start = (int)seqPointer.StartingIndex + startIndex;
    if (start >= seqPointer.EndingIndex)
    {
        return null;
    }

    // Offset contributed by the line terminators of the lines preceding the sequence.
    int includesNewline = seqPointer.StartingLine * Environment.NewLine.Length;
    int len = (int)(seqPointer.EndingIndex - seqPointer.StartingIndex);

    using (BioTextReader bioReader = new BioTextReader(_fileName))
    {
        string str = bioReader.ReadBlock(startIndex, seqPointer.StartingIndex + includesNewline, count, len);
        sequence.InsertRange(0, str);
    }

    // default for partial load
    sequence.IsReadOnly = true;

    return sequence;
}
/// <summary>
/// Deserialization constructor: restores the alphabet, the per-symbol counts
/// and the total count from the serialization data.
/// </summary>
/// <param name="info">Serialization Info.</param>
/// <param name="context">Streaming context.</param>
protected SequenceStatistics(SerializationInfo info, StreamingContext context)
{
    if (info == null)
    {
        throw new ArgumentNullException("info");
    }

    // "AN" stores the alphabet name; it is resolved against the known alphabets.
    _alphabet = Alphabets.All.Single(A => A.Name.Equals(info.GetString("AN")));

    // "CHP" tells whether a count table was stored under "CH";
    // when it was not, start from an empty table.
    _countHash = info.GetBoolean("CHP")
        ? (Dictionary<char, int>)info.GetValue("CH", typeof(Dictionary<char, int>))
        : new Dictionary<char, int>();

    _totalCount = info.GetDouble("TC");
}
/// <summary>
/// Builds a sparse sequence for the given alphabet, removes a block of items
/// with RemoveRange() and asserts that the count dropped as expected.
/// </summary>
/// <param name="alphabet">Alphabet instance used to build the sparse sequence.</param>
private void ValidateSparseSequenceRemoveRange(IAlphabet alphabet)
{
    const string completionMessage = "SparseSequenceP1: Validation of RemoveRange() method by passing position and length is completed";

    SparseSequence populated = CreateSparseSequence(alphabet, 10);
    populated.IsReadOnly = false;

    // Before removal, the count is expected to be the alphabet size plus 10.
    Assert.AreEqual(alphabet.Count + 10, populated.Count);

    populated.RemoveRange(10, 10);

    // Ten items were removed, so only the alphabet-sized remainder is left.
    Assert.AreEqual(alphabet.Count, populated.Count);

    Console.WriteLine(completionMessage);
    ApplicationLog.WriteLine(completionMessage);
}
/// <summary>
/// Builds sequence statistics by walking the sequence once and tallying
/// every non-zero symbol byte.
/// </summary>
/// <param name="sequence">The sequence to construct statistics for.</param>
public SequenceStatistics(ISequence sequence)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    this.alphabet = sequence.Alphabet;

    // One slot per possible byte value; an array is much faster than a dictionary here.
    long[] tally = new long[byte.MaxValue + 1];
    foreach (byte symbol in sequence)
    {
        // Zero bytes are skipped and never counted.
        if (symbol != 0)
        {
            tally[symbol]++;
        }
    }

    LoadFromLongArray(tally);
}
/// <summary>
/// Initializes a new instance of the MultiWaySuffixTree class with the specified sequence.
/// The symbols are first translated through the alphabet's symbol value map, then the
/// suffix tree is built and its suffix links are updated.
/// </summary>
/// <param name="sequence">Sequence to build the suffix tree.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="sequence"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="sequence"/> is empty.</exception>
public MultiWaySuffixTree(ISequence sequence)
{
    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (sequence.Count == 0)
    {
        throw new ArgumentOutOfRangeException("sequence", Resource.EmptySequence);
    }

    byte[] aliasMap = sequence.Alphabet.GetSymbolValueMap();

    var seenSymbols = new HashSet<byte>();
    this.uniqueSymbolsInReference = seenSymbols;
    this.uniqueSymbolsStartIndexes = new long[byte.MaxValue + 1];

    long symbolTotal = sequence.Count;
    var convertedValues = new byte[symbolTotal];

    // The index is a long so the loop cannot overflow when the sequence holds
    // more symbols than an int can count.
    for (long index = 0; index < symbolTotal; index++)
    {
        byte symbol = aliasMap[sequence[index]];

        // Add returns true only the first time a symbol is seen, which is
        // exactly when its first position has to be recorded.
        if (seenSymbols.Add(symbol))
        {
            this.uniqueSymbolsStartIndexes[symbol] = index;
        }

        convertedValues[index] = symbol;
    }

    this.Sequence = sequence;
    this.referenceSequence = new Sequence(sequence.Alphabet, convertedValues, false);
    this.symbolsCount = sequence.Count;
    this.Name = Resource.MultiWaySuffixTreeName;
    this.MinLengthOfMatch = 20;
    this.NoAmbiguity = false;

    // Create root edge.
    this.rootEdge = new MultiWaySuffixEdge();
    this.edgesCount++;

    // Follow the alphabet-to-base-alphabet map until an alphabet without a
    // further mapping is reached.
    this.supportedBaseAlphabet = sequence.Alphabet;
    IAlphabet alphabet;
    while (Alphabets.AlphabetToBaseAlphabetMap.TryGetValue(this.supportedBaseAlphabet, out alphabet))
    {
        this.supportedBaseAlphabet = alphabet;
    }

    // Build the suffix tree.
    this.BuildSuffixTree();

    // Update tree with suffixLinks.
    this.UpdateSuffixLinks();
}
/// <summary>
/// Parses a single FASTA record from the reader and returns it as a sequence.
/// </summary>
/// <param name="reader">Text reader positioned at the start of a record.</param>
/// <param name="buffer">Working buffer used to accumulate the sequence symbols.
/// It is grown locally when a record does not fit; the caller's array is not replaced.</param>
/// <returns>The parsed sequence, or null when the reader has no more data.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="reader"/> is null, or when <paramref name="buffer"/> is null
/// and there is data to parse.
/// </exception>
ISequence ParseOne(TextReader reader, byte[] buffer)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    if (reader.Peek() == -1)
    {
        return null;
    }

    if (buffer == null)
    {
        throw new ArgumentNullException("buffer");
    }

    // Track the real capacity of the supplied buffer instead of assuming it
    // matches the platform default; a smaller buffer would otherwise overflow
    // in Array.Copy before any resize is attempted.
    int currentBufferSize = buffer.Length;

    string message;
    string line = reader.ReadLine();

    // Continue reading if blank line found.
    while (line != null && string.IsNullOrEmpty(line))
    {
        line = reader.ReadLine();
    }

    if (line == null || !line.StartsWith(">", StringComparison.OrdinalIgnoreCase))
    {
        message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.INVALID_INPUT_FILE,
            Properties.Resource.FASTA_NAME);
        throw new Exception(message);
    }

    // Everything after the '>' marker is the record name.
    string name = line.Substring(1);
    int bufferPosition = 0;

    // Read next line.
    line = reader.ReadLine();

    // Continue reading if blank line found.
    while (line != null && string.IsNullOrEmpty(line))
    {
        line = reader.ReadLine();
    }

    if (line == null)
    {
        message = string.Format(
            CultureInfo.InvariantCulture,
            Properties.Resource.InvalidSymbolInString,
            string.Empty);
        throw new Exception(message);
    }

    IAlphabet alphabet = Alphabet;
    bool tryAutoDetectAlphabet = alphabet == null;

    do
    {
        // Work with the encoded byte count throughout: for non-ASCII input the
        // UTF-8 byte count differs from the character count, and it is bytes
        // that are copied into, and validated in, the buffer.
        byte[] symbols = Encoding.UTF8.GetBytes(line);
        int lineLength = symbols.Length;

        // Files > 2G are not supported in this release.
        if (((long)bufferPosition + lineLength) >= PlatformManager.Services.MaxSequenceSize)
        {
            throw new ArgumentOutOfRangeException(
                string.Format(
                    CultureInfo.CurrentUICulture,
                    Properties.Resource.SequenceDataGreaterthan2GB,
                    name));
        }

        int neededSize = bufferPosition + lineLength;
        if (neededSize >= currentBufferSize)
        {
            // Grow the buffer by the default buffer size, or to the needed size
            // when the default increment is still too small for this line.
            int suggestedSize = buffer.Length + PlatformManager.Services.DefaultBufferSize;
            int newSize = neededSize < suggestedSize ? suggestedSize : neededSize;
            Array.Resize(ref buffer, newSize);
            currentBufferSize = newSize;
        }

        // Array.Copy -- for performance improvement.
        Array.Copy(symbols, 0, buffer, bufferPosition, lineLength);

        // Auto detect alphabet if alphabet is set to null, else validate with already set alphabet
        if (tryAutoDetectAlphabet)
        {
            // If we have a base alphabet we detected earlier,
            // then try that first.
            if (this.baseAlphabet != null &&
                this.baseAlphabet.ValidateSequence(buffer, bufferPosition, lineLength))
            {
                alphabet = this.baseAlphabet;
            }
            else
            {
                // Different alphabet - try to auto detect.
                this.baseAlphabet = null;
                alphabet = Alphabets.AutoDetectAlphabet(buffer, bufferPosition, bufferPosition + lineLength, alphabet);
                if (alphabet == null)
                {
                    throw new Exception(string.Format(
                        CultureInfo.InvariantCulture,
                        Properties.Resource.InvalidSymbolInString,
                        line));
                }
            }

            // Determine the base alphabet used.
            if (this.baseAlphabet == null)
            {
                this.baseAlphabet = alphabet;
            }
            else
            {
                // If they are not the same, then this might be an error.
                if (this.baseAlphabet != alphabet)
                {
                    // If the new alphabet includes all the base alphabet then use it instead.
                    // This happens when we hit an ambiguous form of the alphabet later in the file.
                    if (!this.baseAlphabet.HasAmbiguity &&
                        Alphabets.GetAmbiguousAlphabet(this.baseAlphabet) == alphabet)
                    {
                        this.baseAlphabet = alphabet;
                    }
                    else if (alphabet.HasAmbiguity ||
                             Alphabets.GetAmbiguousAlphabet(alphabet) != this.baseAlphabet)
                    {
                        throw new Exception(Properties.Resource.FastAContainsMorethanOnebaseAlphabet);
                    }
                }
            }
        }
        else
        {
            // Validate against supplied alphabet.
            if (!alphabet.ValidateSequence(buffer, bufferPosition, lineLength))
            {
                throw new Exception(string.Format(
                    CultureInfo.InvariantCulture,
                    Properties.Resource.InvalidSymbolInString,
                    line));
            }
        }

        bufferPosition += lineLength;

        // A '>' at the start of the next line begins the next record; leave it unread.
        if (reader.Peek() == (byte)'>')
        {
            break;
        }

        // Read next line.
        line = reader.ReadLine();

        // Continue reading if blank line found.
        while (line != null && string.IsNullOrEmpty(line) && reader.Peek() != (byte)'>')
        {
            line = reader.ReadLine();
        }
    }
    while (line != null);

    // Truncate buffer to remove trailing 0's
    byte[] tmpBuffer = new byte[bufferPosition];
    Array.Copy(buffer, tmpBuffer, bufferPosition);

    if (tryAutoDetectAlphabet)
    {
        alphabet = this.baseAlphabet;
    }

    // In memory sequence
    return new Sequence(alphabet, tmpBuffer, false) { ID = name };
}
// Looks up the sequence entry registered under sequenceName, creating and
// registering a new, empty one when none exists yet.
//
// When isSeqInFeature is false the entry lives in the header list. Otherwise an
// entry found in the header list is moved over to the main sequence list, and
// failing that the main sequence list is searched. A null alphabetType falls
// back to Alphabets.DNA.
private Tuple<ISequence, List<byte>> GetSpecificSequence(string sequenceName, IAlphabet alphabetType, bool isSeqInFeature = true)
{
    if (alphabetType == null)
    {
        alphabetType = Alphabets.DNA;
    }

    Tuple<ISequence, List<byte>> entry;

    if (!isSeqInFeature)
    {
        // Sequence is referred in header.
        entry = this.sequencesInHeader.FirstOrDefault(candidate => candidate.Item1.ID.Equals(sequenceName));
        if (entry == null)
        {
            entry = new Tuple<ISequence, List<byte>>(
                new Sequence(alphabetType, string.Empty) { ID = sequenceName },
                new List<byte>());
            this.sequencesInHeader.Add(entry);
        }

        return entry;
    }

    entry = null;

    if (this.sequencesInHeader.Count > 0)
    {
        entry = this.sequencesInHeader.FirstOrDefault(candidate => candidate.Item1.ID.Equals(sequenceName));
        if (entry != null)
        {
            // Promote the header entry to the main list and hand it back.
            this.sequencesInHeader.Remove(entry);
            this.sequences.Add(entry);
            return entry;
        }
    }

    // Only search the main list when it has something in it.
    if (this.sequences.Count != 0)
    {
        entry = this.sequences.FirstOrDefault(candidate => candidate.Item1.ID.Equals(sequenceName));
    }

    if (entry == null)
    {
        entry = new Tuple<ISequence, List<byte>>(
            new Sequence(alphabetType, string.Empty) { ID = sequenceName },
            new List<byte>());
        this.sequences.Add(entry);
    }

    return entry;
}
/// <summary>
/// Creates a QualitativeSequence from a symbol string and a string of encoded
/// quality scores, for the given alphabet and FastQ format type.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="fastQFormatType">FastQ format type.</param>
/// <param name="sequence">A string representing the symbols.</param>
/// <param name="encodedQualityScores">A string representing the encoded quality scores.</param>
/// <param name="validate">When true, the lengths, the symbols and the quality scores are
/// checked before the data is stored; when false all checks are skipped.</param>
public QualitativeSequence(IAlphabet alphabet, FastQFormatType fastQFormatType, string sequence, string encodedQualityScores, bool validate)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    this.Alphabet = alphabet;
    this.ID = string.Empty;

    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (encodedQualityScores == null)
    {
        throw new ArgumentNullException("encodedQualityScores");
    }

    this.FormatType = fastQFormatType;
    this.sequenceData = Encoding.UTF8.GetBytes(sequence);
    byte[] rawScores = Encoding.UTF8.GetBytes(encodedQualityScores);

    if (validate)
    {
        long symbolLength = this.sequenceData.GetLongLength();
        long scoreLength = rawScores.GetLongLength();

        // Each symbol needs exactly one quality score.
        if (symbolLength != scoreLength)
        {
            throw new ArgumentException(string.Format(
                CultureInfo.CurrentUICulture,
                Properties.Resource.DifferenceInSequenceAndQualityScoresLengthMessage,
                symbolLength,
                scoreLength));
        }

        // Symbols must belong to the alphabet.
        if (!this.Alphabet.ValidateSequence(this.sequenceData, 0, this.sequenceData.GetLongLength()))
        {
            throw Helper.GenerateAlphabetCheckFailureException(this.Alphabet, sequenceData);
        }

        // Encoded scores must be valid for the format type.
        byte offendingScore;
        if (!ValidateQualScores(rawScores, this.FormatType, out offendingScore))
        {
            throw new ArgumentOutOfRangeException(
                "encodedQualityScores",
                string.Format(
                    CultureInfo.CurrentUICulture,
                    Properties.Resource.InvalidEncodedQualityScoreFound,
                    (char)offendingScore,
                    this.FormatType));
        }
    }

    this.qualityScores = GetDecodedQualScoresInSignedBytes(rawScores, this.FormatType);
    this.Count = this.sequenceData.GetLongLength();
}
/// <summary>
/// Creates a QualitativeSequence from a symbol string and a string of encoded
/// quality scores, with validation switched on: it forwards to the five-argument
/// constructor passing true for the validate flag.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="fastQFormatType">FastQ format type.</param>
/// <param name="sequence">A string representing the symbols.</param>
/// <param name="encodedQualityScores">A string representing the encoded quality scores.</param>
public QualitativeSequence(
    IAlphabet alphabet,
    FastQFormatType fastQFormatType,
    string sequence,
    string encodedQualityScores)
    : this(alphabet, fastQFormatType, sequence, encodedQualityScores, validate: true)
{
}
/// <summary>
/// Creates a Sequence from an alphabet and a symbol string, with validation
/// switched on: it forwards to the three-argument constructor passing true
/// for the validate flag.
/// </summary>
/// <param name="alphabet">Alphabet to which this class should conform.</param>
/// <param name="sequence">The sequence in string form.</param>
public Sequence(IAlphabet alphabet, string sequence)
    : this(alphabet, sequence, validate: true)
{
}
/// <summary>
/// Derives the working alphabet from the input sequences and checks, or chooses,
/// the similarity matrix that goes with it.
/// </summary>
/// <param name="sequences">Input sequences</param>
/// <param name="similarityMatrix">Matrix to use for similarity comparisons; when null a
/// default matrix is picked for the alphabet.</param>
/// <param name="fixSimilarityMatrixErrors">True to replace a matrix that does not suit the
/// alphabet with the default one instead of throwing.</param>
private void SetAlphabet(IList<ISequence> sequences, SimilarityMatrix similarityMatrix, bool fixSimilarityMatrixErrors)
{
    if (sequences.Count == 0)
    {
        throw new ArgumentException("Empty input sequences");
    }

    // The first sequence decides the alphabet; all others must share its base.
    this.alphabet = Alphabets.GetAmbiguousAlphabet(sequences[0].Alphabet);

    Parallel.For(1, sequences.Count, ParallelOption, i =>
    {
        if (!Alphabets.CheckIsFromSameBase(sequences[i].Alphabet, this.alphabet))
        {
            throw new ArgumentException("Inconsistent sequence alphabet");
        }
    });

    // Default matrix for the detected alphabet (stays null for an unknown alphabet).
    SimilarityMatrix defaultMatrix = null;
    if (this.alphabet is DnaAlphabet)
    {
        defaultMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna);
    }
    else if (this.alphabet is RnaAlphabet)
    {
        defaultMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna);
    }
    else if (this.alphabet is ProteinAlphabet)
    {
        defaultMatrix = new SimilarityMatrix(SimilarityMatrix.StandardSimilarityMatrix.Blosum50);
    }

    if (similarityMatrix == null)
    {
        // No matrix supplied: fall back to the default for this alphabet.
        SimilarityMatrix = defaultMatrix;
        if (SimilarityMatrix == null)
        {
            throw new ArgumentException("Unknown alphabet - could not choose SimilarityMatrix.");
        }

        return;
    }

    // A matrix was supplied: work out which matrix names suit the alphabet.
    List<String> acceptedNames;
    string mismatchMessage;
    if (this.alphabet is DnaAlphabet)
    {
        acceptedNames = new List<String> { "AmbiguousDNA" };
        mismatchMessage = "Inappropriate Similarity Matrix for DNA.";
    }
    else if (this.alphabet is ProteinAlphabet)
    {
        acceptedNames = new List<String> { "BLOSUM45", "BLOSUM50", "BLOSUM62", "BLOSUM80", "BLOSUM90", "PAM250", "PAM30", "PAM70" };
        mismatchMessage = "Inappropriate Similarity Matrix for Protein.";
    }
    else if (this.alphabet is RnaAlphabet)
    {
        acceptedNames = new List<String> { "AmbiguousRNA" };
        mismatchMessage = "Inappropriate Similarity Matrix for RNA.";
    }
    else
    {
        throw new ArgumentException("Invalid alphabet");
    }

    if (!acceptedNames.Contains(similarityMatrix.Name))
    {
        if (!fixSimilarityMatrixErrors)
        {
            throw new ArgumentException(mismatchMessage);
        }

        SimilarityMatrix = defaultMatrix;
    }
}
/// <summary>
/// Builds two sequence objects from the given strings, converting each string
/// to lower or upper case as requested by the case type.
/// </summary>
/// <param name="firstSequenceString">First sequence string.</param>
/// <param name="secondSequenceString">Second sequence string.</param>
/// <param name="alphabet">alphabet type.</param>
/// <param name="caseType">Sequence case type: LowerCase lowers both, UpperCase raises both,
/// LowerUpperCase lowers the first and raises the second; anything else leaves both as given.</param>
/// <param name="firstInputSequence">First input sequence object.</param>
/// <param name="secondInputSequence">Second input sequence object.</param>
private static void GetSequenceWithCaseType(string firstSequenceString, string secondSequenceString, IAlphabet alphabet, SequenceCaseType caseType, out Sequence firstInputSequence, out Sequence secondInputSequence)
{
    // First sequence: lower-cased for LowerCase and LowerUpperCase.
    string firstText = firstSequenceString.ToString(null);
    if (caseType == SequenceCaseType.LowerCase || caseType == SequenceCaseType.LowerUpperCase)
    {
        firstText = firstText.ToLower(CultureInfo.CurrentCulture);
    }
    else if (caseType == SequenceCaseType.UpperCase)
    {
        firstText = firstText.ToUpper(CultureInfo.CurrentCulture);
    }

    firstInputSequence = new Sequence(alphabet, firstText);

    // Second sequence: upper-cased for UpperCase and LowerUpperCase.
    string secondText = secondSequenceString.ToString(null);
    if (caseType == SequenceCaseType.LowerCase)
    {
        secondText = secondText.ToLower(CultureInfo.CurrentCulture);
    }
    else if (caseType == SequenceCaseType.UpperCase || caseType == SequenceCaseType.LowerUpperCase)
    {
        secondText = secondText.ToUpper(CultureInfo.CurrentCulture);
    }

    secondInputSequence = new Sequence(alphabet, secondText);
}
/// <summary>
/// Assemble the input sequences into the largest possible contigs.
/// </summary>
/// <remarks>
/// The algorithm is:
/// 1. initialize list of contigs to empty list. List of seqs is passed as argument.
/// 2. compute pairwise overlap scores for each pair of input seqs (with reversal and
///    complementation as appropriate).
/// 3. choose best overlap score. the "merge items" (can be seqs or contigs) are the
///    items with that score. If best score is less than threshold, assembly is finished.
/// 4. merge the merge items into a single contig and remove them from their list(s)
/// 5. compute the overlap between new item and all existing items
/// 6. go to step 3
/// A merge is only attempted when an actual pair produced the best score; if no pair
/// beats the starting sentinel score, assembly finishes without merging.
/// </remarks>
/// <param name="inputSequences">The sequences to assemble.</param>
/// <returns>Returns the OverlapDeNovoAssembly instance which contains list of
/// contigs and list of unmerged sequences which are result of this assembly.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="inputSequences"/> is null.</exception>
public IDeNovoAssembly Assemble(IEnumerable<ISequence> inputSequences)
{
    if (null == inputSequences)
    {
        throw new ArgumentNullException(Properties.Resource.ParameterNameInputSequences);
    }

    // numbering convention: every pool item (whether sequence or contig)
    // gets a fixed number.
    // sequence index = index into inputs (which we won't modify)
    // contig index = nSequences + index into contigs
    List<PoolItem> pool = inputSequences.Select(seq => new PoolItem(seq)).ToList();

    // Initialization
    int sequenceCount = pool.Count;
    if (sequenceCount > 0)
    {
        _sequenceAlphabet = pool[0].Sequence.Alphabet;

        if (ConsensusResolver == null)
        {
            ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
        }
        else
        {
            ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
        }
    }

    // put all the initial sequences into the pool, and generate the pair scores.
    // there are no contigs in the pool yet.
    // to save an iteration, we'll also find the best global score as we go.
    ItemScore globalBest = new ItemScore(-1, -1, false, false, 0, 0);
    int globalBestLargerIndex = -1;
    int unconsumedCount = sequenceCount;

    // Compute alignment scores for all combinations between input sequences
    // Store these scores in the poolItem corresponding to each sequence
    for (int newSeq = 0; newSeq < pool.Count; ++newSeq)
    {
        PoolItem newItem = pool[newSeq];
        for (int oldSeq = 0; oldSeq < newSeq; ++oldSeq)
        {
            PoolItem oldItem = pool[oldSeq];
            ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
            newItem.Scores.Add(score);
            if (score.OverlapScore > globalBest.OverlapScore)
            {
                globalBest = new ItemScore(score);
                globalBestLargerIndex = newSeq;
            }
        }
    }

    // Merge sequence if best score is above threshold
    // and add new contig to pool.
    // globalBestLargerIndex stays -1 when no pair scored above the sentinel;
    // in that case there is nothing to merge, and indexing the pool with the
    // sentinel indices would throw.
    if (globalBestLargerIndex >= 0 && globalBest.OverlapScore >= MergeThreshold)
    {
        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine("Merging (overlap score {0}):", globalBest.OverlapScore);
        }

        PoolItem mergeItem1 = pool[globalBest.OtherItem];
        PoolItem mergeItem2 = pool[globalBestLargerIndex];
        Contig newContig = new Contig();

        if (Trace.Want(Trace.AssemblyDetails))
        {
            ApplicationLog.WriteLine(
                "new pool item {0} will merge old items {1} and {2}",
                pool.Count,
                globalBest.OtherItem,
                globalBestLargerIndex);
        }

        // Only input sequences exist at this point, so both merge items are sequences.
        MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
        MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);

        MakeConsensus(newContig);

        // Set ConsumedBy value and
        // free memory as these sequences are no longer used
        mergeItem1.ConsumedBy = pool.Count;
        mergeItem2.ConsumedBy = pool.Count;
        mergeItem1.FreeSequences();
        mergeItem2.FreeSequences();

        pool.Add(new PoolItem(newContig));

        // Two items were consumed and one contig was added.
        unconsumedCount--;

        while (unconsumedCount > 1)
        {
            // Compute scores for each unconsumed sequence with new contig
            int newSeq = pool.Count - 1;
            PoolItem newItem = pool[newSeq];
            for (int oldSeq = 0; oldSeq < pool.Count - 1; ++oldSeq)
            {
                PoolItem oldItem = pool[oldSeq];
                if (oldItem.ConsumedBy >= 0)
                {
                    // already consumed - just add dummy score to maintain correct indices
                    newItem.Scores.Add(new ItemScore());
                }
                else
                {
                    ItemScore score = AlignSequence(oldItem.SequenceOrConsensus, newItem.SequenceOrConsensus, oldSeq, newSeq);
                    newItem.Scores.Add(score);
                }
            }

            // find best global score in the modified pool.
            globalBest = new ItemScore(-1, -1, false, false, 0, 0);
            globalBestLargerIndex = -1;
            for (int current = 0; current < pool.Count; ++current)
            {
                PoolItem curItem = pool[current];
                if (curItem.ConsumedBy < 0)
                {
                    for (int other = 0; other < current; ++other)
                    {
                        if (pool[other].ConsumedBy < 0)
                        {
                            ItemScore itemScore = curItem.Scores[other];
                            if (itemScore.OverlapScore > globalBest.OverlapScore)
                            {
                                globalBest = new ItemScore(itemScore); // copy the winner so far
                                globalBestLargerIndex = current;
                            }
                        }
                    }
                }
            }

            // Same guard as for the first merge: only merge when a real pair won.
            if (globalBestLargerIndex >= 0 && globalBest.OverlapScore >= MergeThreshold)
            {
                // Merge sequences / contigs if above threshold
                mergeItem1 = pool[globalBest.OtherItem];
                mergeItem2 = pool[globalBestLargerIndex];
                newContig = new Contig();

                if (mergeItem1.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem,
                            globalBest.Reversed,
                            globalBest.Complemented,
                            globalBest.FirstOffset);
                    }

                    MergeLowerIndexedContig(newContig, globalBest, mergeItem1.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (reversed = {1}, complemented = {2}, offset = {3}",
                            globalBest.OtherItem,
                            globalBest.Reversed,
                            globalBest.Complemented,
                            globalBest.FirstOffset);
                    }

                    MergeLowerIndexedSequence(newContig, globalBest, mergeItem1.Sequence);
                }

                if (mergeItem2.IsContig)
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a contig (offset = {1}",
                            globalBestLargerIndex,
                            globalBest.SecondOffset);
                    }

                    MergeHigherIndexedContig(newContig, globalBest, mergeItem2.Contig);
                }
                else
                {
                    if (Trace.Want(Trace.AssemblyDetails))
                    {
                        ApplicationLog.WriteLine(
                            "item {0} is a sequence (offset = {1}",
                            globalBestLargerIndex,
                            globalBest.SecondOffset);
                    }

                    MergeHigherIndexedSequence(newContig, globalBest, mergeItem2.Sequence);
                }

                MakeConsensus(newContig);
                if (Trace.Want(Trace.AssemblyDetails))
                {
                    Dump(newContig);
                }

                // Set ConsumedBy value for these poolItems and
                // free memory as these sequences are no longer used
                mergeItem1.ConsumedBy = pool.Count;
                mergeItem2.ConsumedBy = pool.Count;
                mergeItem1.FreeSequences();
                mergeItem2.FreeSequences();

                pool.Add(new PoolItem(newContig));
                unconsumedCount--;
            }
            else
            {
                // None of the alignment scores cross threshold
                // No more merges possible. So end iteration.
                break;
            }
        }
    }

    // no further qualifying merges, so we're done.
    // populate contigs and unmergedSequences
    OverlapDeNovoAssembly sequenceAssembly = new OverlapDeNovoAssembly();
    foreach (PoolItem curItem in pool)
    {
        if (curItem.ConsumedBy < 0)
        {
            if (curItem.IsContig)
            {
                sequenceAssembly.Contigs.Add(curItem.Contig);
            }
            else
            {
                sequenceAssembly.UnmergedSequences.Add(curItem.Sequence);
            }
        }
    }

    return sequenceAssembly;
}
/// <summary>
/// Builds a sequence from the source string and wraps it in a DerivedSequence.
/// Both boolean constructor flags are passed as false.
/// </summary>
/// <param name="alphabet">Alphabet</param>
/// <param name="source">source sequence</param>
/// <returns>The derived sequence wrapping the newly created sequence.</returns>
private static DerivedSequence CreateDerivedSequence(
    IAlphabet alphabet,
    string source)
{
    ISequence underlying = new Sequence(alphabet, source);
    return new DerivedSequence(underlying, false, false);
}
///<summary>
/// Creates a contig parser for the given alphabet. The separator and sequence
/// id prefix characters are handed to the base class and also remembered on
/// this instance.
///</summary>
///<param name="alphabet">Alphabet to use for the consensus and assembled sequences that are parsed.</param>
///<param name="separatorChar">Character used to separate sequence item position and symbol in the Xsv file</param>
///<param name="sequenceIdPrefixChar">Character used at the beginning of the sequence start line.</param>
public XsvContigParser(IAlphabet alphabet, char separatorChar, char sequenceIdPrefixChar)
    : base(alphabet, separatorChar, sequenceIdPrefixChar)
{
    this.sequenceIdPrefix = sequenceIdPrefixChar;
    this.separator = separatorChar;
}
/// <summary>
/// Registers the ambiguous counterpart of an alphabet in AmbiguousAlphabetMap,
/// e.g. DnaAlphabet -> AmbiguousDnaAlphabet.
/// </summary>
/// <param name="alphabet">Alphabet used as the map key.</param>
/// <param name="ambiguousAlphabet">Ambiguous alphabet stored for that key.</param>
private static void MapAlphabetToAmbiguousAlphabet(
    IAlphabet alphabet,
    IAlphabet ambiguousAlphabet)
{
    AmbiguousAlphabetMap.Add(alphabet, ambiguousAlphabet);
}
/// <summary>
/// Registers the base counterpart of an alphabet in AlphabetToBaseAlphabetMap,
/// e.g. AmbiguousDnaAlphabet -> DnaAlphabet.
/// </summary>
/// <param name="alphabet">Alphabet used as the map key.</param>
/// <param name="baseAlphabet">Base alphabet stored for that key.</param>
private static void MapAlphabetToBaseAlphabet(
    IAlphabet alphabet,
    IAlphabet baseAlphabet)
{
    AlphabetToBaseAlphabetMap.Add(alphabet, baseAlphabet);
}
/// <summary>
/// Creates a rotor from its two alphabets and the reflector it works with.
/// The arguments are stored as given; no validation is performed.
/// </summary>
/// <param name="leftAlphabet">Alphabet stored as the rotor's left alphabet.</param>
/// <param name="rightAlphabet">Alphabet stored as the rotor's right alphabet.</param>
/// <param name="reflector">Reflector stored on the rotor.</param>
public Rotor(
    IAlphabet leftAlphabet,
    IAlphabet rightAlphabet,
    IReflector reflector)
{
    this._reflector = reflector;
    this._rightAlphabet = rightAlphabet;
    this._leftAlphabet = leftAlphabet;
}
/// <summary>
/// Creates a Sequence from an alphabet and an array of symbol bytes, with
/// validation switched on: it forwards to the three-argument constructor
/// passing true for the validate flag.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="values">An array of bytes representing the symbols.</param>
public Sequence(IAlphabet alphabet, byte[] values)
    : this(alphabet, values, validate: true)
{
}
/// <summary>
/// Sets the sequence alphabet, makes sure a consensus resolver using that
/// alphabet is available, and then builds the consensus for the contig.
/// Public so that consensus generation can be exercised by test automation.
/// </summary>
/// <param name="alphabet">Sequence alphabet</param>
/// <param name="contig">Contig for which consensus is to be constructed</param>
public void MakeConsensus(IAlphabet alphabet, Contig contig)
{
    _sequenceAlphabet = alphabet;

    if (ConsensusResolver != null)
    {
        // Reuse the existing resolver, pointing it at the new alphabet.
        ConsensusResolver.SequenceAlphabet = _sequenceAlphabet;
    }
    else
    {
        ConsensusResolver = new SimpleConsensusResolver(_sequenceAlphabet);
    }

    MakeConsensus(contig);
}
/// <summary>
/// Creates a Sequence from an alphabet and an array of symbol bytes. The bytes
/// are copied, so the instance does not keep a reference to the caller's array.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="values">An array of bytes representing the symbols.</param>
/// <param name="validate">When true the bytes are checked against the alphabet before
/// being stored; when false the check is skipped.</param>
public Sequence(IAlphabet alphabet, byte[] values, bool validate)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    if (values == null)
    {
        throw new ArgumentNullException("values");
    }

    long symbolLength = values.GetLongLength();

    if (validate && !alphabet.ValidateSequence(values, 0, symbolLength))
    {
        throw Helper.GenerateAlphabetCheckFailureException(alphabet, values);
    }

    // Store a private copy of the symbols.
    this._sequenceData = new byte[symbolLength];
    this.ID = string.Empty;
    Helper.Copy(values, this._sequenceData, symbolLength);

    this.Alphabet = alphabet;
    this.Count = this._sequenceData.GetLongLength();
}
/// <summary>
/// Initializes a new instance of the QualitativeSequence class from an alphabet, a FastQ format type,
/// the symbol bytes and one integer quality score per symbol
/// (Phred or Solexa based, according to the FastQ format type).
/// Both arrays are copied; the quality scores are stored as signed bytes.
/// </summary>
/// <param name="alphabet">Alphabet to which this instance should conform.</param>
/// <param name="fastQFormatType">FastQ format type.</param>
/// <param name="sequence">An array of bytes representing the symbols.</param>
/// <param name="qualityScores">An array of integers representing the base quality scores
/// (Phred or Solexa based, according to the FastQ format type).</param>
/// <param name="validate">When true, the array lengths, the symbols and the quality scores are all
/// checked before being stored; when false, every check except the null checks is skipped.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="alphabet"/>, <paramref name="sequence"/> or
/// <paramref name="qualityScores"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// Thrown, when validating, if the symbol and quality score arrays differ in length.
/// </exception>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown, when validating, if ValidateQualScores reports an invalid quality score.
/// </exception>
public QualitativeSequence(IAlphabet alphabet, FastQFormatType fastQFormatType, byte[] sequence, int[] qualityScores, bool validate)
{
    if (alphabet == null)
    {
        throw new ArgumentNullException("alphabet");
    }

    if (sequence == null)
    {
        throw new ArgumentNullException("sequence");
    }

    if (qualityScores == null)
    {
        throw new ArgumentNullException("qualityScores");
    }

    this.Alphabet = alphabet;
    this.ID = string.Empty;
    this.FormatType = fastQFormatType;

    long symbolCount = sequence.GetLongLength();
    long scoreCount = qualityScores.GetLongLength();

    if (validate)
    {
        // Every symbol must have exactly one quality score.
        if (symbolCount != scoreCount)
        {
            throw new ArgumentException(
                string.Format(
                    CultureInfo.CurrentUICulture,
                    Properties.Resource.DifferenceInSequenceAndQualityScoresLengthMessage,
                    symbolCount,
                    scoreCount));
        }

        // Symbols must be valid for the alphabet.
        if (!this.Alphabet.ValidateSequence(sequence, 0, symbolCount))
        {
            throw Helper.GenerateAlphabetCheckFailureException(this.Alphabet, sequence);
        }

        // Quality scores must be acceptable for the chosen FastQ format.
        int invalidQualityScore;
        if (!ValidateQualScores(qualityScores, this.FormatType, out invalidQualityScore))
        {
            throw new ArgumentOutOfRangeException(
                "qualityScores",
                string.Format(
                    CultureInfo.CurrentUICulture,
                    Properties.Resource.InvalidQualityScoreFound,
                    invalidQualityScore,
                    this.FormatType));
        }
    }

    // Keep private copies of both arrays.
    this.sequenceData = new byte[symbolCount];
    this.qualityScores = new sbyte[scoreCount];
    Helper.Copy(sequence, this.sequenceData, symbolCount);

    // Narrow each integer score to a signed byte.
    for (long index = 0; index < scoreCount; index++)
    {
        this.qualityScores[index] = (sbyte)qualityScores[index];
    }

    this.Count = this.sequenceData.GetLongLength();
}
/// <summary>
/// Maps one of the standard DNA, RNA or protein alphabet instances to its entry in the
/// ambiguous alphabet map. Any other alphabet is returned unchanged.
/// </summary>
/// <param name="currentAlphabet">Alphabet to look up.</param>
/// <returns>
/// The AmbiguousAlphabetMap entry for <paramref name="currentAlphabet"/> when it is
/// DnaAlphabet.Instance, RnaAlphabet.Instance or ProteinAlphabet.Instance;
/// otherwise <paramref name="currentAlphabet"/> itself.
/// </returns>
public static IAlphabet GetAmbiguousAlphabet(IAlphabet currentAlphabet)
{
    // Only the three standard alphabet instances are looked up in the map.
    if (currentAlphabet != DnaAlphabet.Instance
        && currentAlphabet != RnaAlphabet.Instance
        && currentAlphabet != ProteinAlphabet.Instance)
    {
        return currentAlphabet;
    }

    return AmbiguousAlphabetMap[currentAlphabet];
}
/// <summary>
/// Builds a sparse sequence from the items of the given alphabet.
/// </summary>
/// <param name="alphabet">Alphabet whose items are listed and passed to the sparse sequence.</param>
/// <param name="insertPosition">Position passed to the SparseSequence constructor together with the items.</param>
/// <returns>The newly created sparse sequence.</returns>
private static SparseSequence CreateSparseSequence(IAlphabet alphabet, int insertPosition)
{
    // Materialise the alphabet's items and hand them straight to the sparse sequence.
    return new SparseSequence(alphabet, insertPosition, alphabet.ToList());
}
/// <summary>
/// Verifies whether two given alphabets come from the same base alphabet.
/// Each alphabet is replaced by its entry in AlphabetToBaseAlphabetMap when one exists
/// (otherwise it stands for itself), and the two results are compared.
/// </summary>
/// <param name="alphabetA">First alphabet to compare.</param>
/// <param name="alphabetB">Second alphabet to compare.</param>
/// <returns>True if both alphabets are the same instance or resolve to the same base alphabet.</returns>
public static bool CheckIsFromSameBase(IAlphabet alphabetA, IAlphabet alphabetB)
{
    // Identical references trivially share a base.
    if (alphabetA == alphabetB)
    {
        return true;
    }

    // Resolve each alphabet with a single lookup; TryGetValue resets the out
    // variable on a miss, so fall back to the alphabet itself in that case.
    IAlphabet baseOfA;
    if (!AlphabetToBaseAlphabetMap.TryGetValue(alphabetA, out baseOfA))
    {
        baseOfA = alphabetA;
    }

    IAlphabet baseOfB;
    if (!AlphabetToBaseAlphabetMap.TryGetValue(alphabetB, out baseOfB))
    {
        baseOfB = alphabetB;
    }

    return baseOfA == baseOfB;
}
/// <summary>
/// Walks the alphabet priority list, starting at the already identified alphabet (or at the
/// first entry when none is given), and returns the first alphabet that validates the symbols.
/// If no entry from that point on validates, every registered alphabet is tried as a last resort.
/// </summary>
/// <param name="symbols">Symbols on which auto detection should be performed.</param>
/// <param name="offset">Offset from which the auto detection should start.</param>
/// <param name="length">Number of symbols to process from the offset position.</param>
/// <param name="identifiedAlphabetType">In case the symbols passed are a subset of a bigger sequence,
/// the alphabet type already identified for that sequence; null to start from the top of the
/// priority list.</param>
/// <returns>Returns the detected alphabet type or null if detection fails.</returns>
/// <exception cref="ArgumentException">
/// Thrown when <paramref name="identifiedAlphabetType"/> is not present in the priority list.
/// </exception>
public static IAlphabet AutoDetectAlphabet(byte[] symbols, long offset, long length, IAlphabet identifiedAlphabetType)
{
    if (identifiedAlphabetType == null)
    {
        identifiedAlphabetType = AlphabetPriorityList[0];
    }

    // Find where the already identified alphabet sits in the priority list.
    int startIndex = 0;
    while (AlphabetPriorityList[startIndex] != identifiedAlphabetType)
    {
        startIndex++;
        if (startIndex == AlphabetPriorityList.Count)
        {
            throw new ArgumentException(Resource.CouldNotRecognizeAlphabet, "identifiedAlphabetType");
        }
    }

    // Try each alphabet from that position onwards, in priority order.
    for (int index = startIndex; index < AlphabetPriorityList.Count; index++)
    {
        if (AlphabetPriorityList[index].ValidateSequence(symbols, offset, length))
        {
            return AlphabetPriorityList[index];
        }
    }

    // Last ditch effort - look at all registered alphabets and see if any accepts the symbols.
    foreach (IAlphabet candidate in All)
    {
        try
        {
            if (candidate.ValidateSequence(symbols, offset, length))
            {
                return candidate;
            }
        }
        catch (NotImplementedException)
        {
            // This alphabet does not support validation -- ignore it and move on.
        }
    }

    // No alphabet found.
    return null;
}
/// <summary>
/// Validates a single input sequence: it must be non-null, use the expected alphabet and
/// be longer than LengthOfMUM. An ArgumentException is thrown for the first rule that fails.
/// </summary>
/// <param name="sequence">Sequence to validate.</param>
/// <param name="alphabetSet">Alphabet the sequence is required to use.</param>
/// <param name="sequenceType">Type of sequence; when it equals ReferenceSequence the
/// reference-sequence message is used for a null sequence, otherwise the query-sequence message.</param>
/// <exception cref="ArgumentException">
/// Thrown when the sequence is null, its alphabet differs from <paramref name="alphabetSet"/>,
/// or its length does not exceed LengthOfMUM.
/// </exception>
public void ValidateSequenceList(ISequence sequence, IAlphabet alphabetSet, string sequenceType)
{
    if (sequence == null)
    {
        // Pick the message that matches the kind of sequence that is missing.
        throw new ArgumentException(
            sequenceType == ReferenceSequence
                ? Properties.Resource.ReferenceSequenceCannotBeNull
                : Properties.Resource.QuerySequenceCannotBeNull);
    }

    if (sequence.Alphabet != alphabetSet)
    {
        throw new ArgumentException(Properties.Resource.InputAlphabetsMismatch);
    }

    // The sequence has to be strictly longer than the MUM length.
    if (sequence.Count <= LengthOfMUM)
    {
        throw new ArgumentException(
            String.Format(
                CultureInfo.CurrentCulture,
                Properties.Resource.InputSequenceMustBeGreaterThanMUM,
                LengthOfMUM));
    }
}
/// <summary>
/// Gets the name of the default similarity matrix for assembling sequences of the given alphabet:
/// AmbiguousDna for DNA, AmbiguousRna for RNA, Blosum50 for protein, and AmbiguousDna for
/// anything else.
/// </summary>
/// <param name="sequenceAlphabet">Alphabet of the sequences being assembled.</param>
/// <returns>Similarity matrix name</returns>
private string GetDefaultSM(IAlphabet sequenceAlphabet)
{
    SimilarityMatrix.StandardSimilarityMatrix matrix;

    if (sequenceAlphabet == Alphabets.DNA)
    {
        matrix = SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna;
    }
    else if (sequenceAlphabet == Alphabets.RNA)
    {
        matrix = SimilarityMatrix.StandardSimilarityMatrix.AmbiguousRna;
    }
    else if (sequenceAlphabet == Alphabets.Protein)
    {
        matrix = SimilarityMatrix.StandardSimilarityMatrix.Blosum50;
    }
    else
    {
        // Unrecognised alphabets fall back to the DNA matrix.
        matrix = SimilarityMatrix.StandardSimilarityMatrix.AmbiguousDna;
    }

    return matrix.ToString();
}