/// <summary>
/// Resets the shared test fixture state: clears the pattern file path, the
/// loaded sequence list and the loaded pattern definition, and puts the
/// search start position back to 1.
/// </summary>
public void SetUp()
{
    SearchPosition = 1;
    BiopatMLFilePath = string.Empty;
    BioList = null;
    MyPatterns = null;
}
/// <summary>
/// Reads a GenBank file and has it parsed by the MBF library, then converts
/// each parsed sequence with <c>ConvertToBioPatMLSeq</c>.
/// </summary>
/// <param name="genbankFileURL">Your GenBank file path.</param>
/// <returns>A list holding one converted sequence per parsed sequence.</returns>
/// <exception cref="System.NotImplementedException">
/// Thrown when <c>IsOnline</c> is true; online reading is not supported.</exception>
private SequenceList ParseSequencePath(string genbankFileURL)
{
    if (IsOnline)
    {
        throw new NotImplementedException("online genbank reading is not supported in this version!");
    }

    // Create the parser first; always try parsing multiple sequences in a file.
    ISequenceParser gbParser = new GenBankParser();
    List<ISequence> mbfSequences = gbParser.Parse(genbankFileURL);

    SequenceList bioSeqList = new SequenceList();
    foreach (Sequence mbfseq in mbfSequences)
    {
        // Convert each parsed sequence exactly once and keep the result.
        bioSeqList.Add(ConvertToBioPatMLSeq(mbfseq));
    }

    return bioSeqList;
}
/// <summary>
/// Searches the single-DNA-sequence GenBank resource with the Motif pattern
/// definition and spot-checks the resulting matches.
/// </summary>
public void TestMotifPattern_Motif()
{
    BiopatMLFilePath = "BioPaperTestData/MotifPattern/Motif.xml";

    using (BioPatMBF_Reader reader = new BioPatMBF_Reader())
    {
        BioList = reader.Read(Global.GetResourceReader(_singleDnaSeqGenBankFilename));
    }

    MyPatterns = DefinitionIO.Read(Global.GetResourceReader(BiopatMLFilePath));

    FeatureList hits = BioList[0].Search(SearchPosition, BioList[0].Length, MyPatterns.Pattern);

    // According to the Jacobi library the total number of matches should be 57.
    Assert.AreEqual(57, hits.Count);
    Assert.AreEqual("Pribnow-box", hits.Name);

    // Spot check: the 11th match of the 57.
    Match eleventh = (Match) hits[10];
    Assert.AreEqual(0.66, eleventh.Similarity, 1e-2);
    Assert.AreEqual(6, eleventh.Length);
    Assert.AreEqual("ttttat", eleventh.Letters());

    // Spot check: the first match.
    Match first = (Match) hits[0];
    Assert.AreEqual(AlphabetFactory.Instance(AlphabetType.DNA), first.Alphabet);
    Assert.AreEqual(6, first.Length);
    Assert.AreEqual(0.5, first.Similarity, 1e-2);

    // Spot check: the last match.
    Match last = (Match) hits[56];
    Assert.AreEqual(0.5, last.Similarity, 1e-2);
    Assert.AreEqual("tttctt", last.Letters());
}
/// <summary>
/// Searches the single-DNA-sequence GenBank resource with the SeriesAll
/// structured pattern and verifies the first match and its sub-matches.
/// </summary>
public void TestStructuredPattern_SeriesAll()
{
    BiopatMLFilePath = "BioPaperTestData/StructuredPattern/SeriesAll.xml";

    using (BioPatMBF_Reader reader = new BioPatMBF_Reader())
    {
        BioList = reader.Read(Global.GetResourceReader(_singleDnaSeqGenBankFilename));
    }

    MyPatterns = DefinitionIO.Read(Global.GetResourceReader(BiopatMLFilePath));

    FeatureList hits = BioList[0].Search(SearchPosition, BioList[0].Length, MyPatterns.Pattern);

    // Expecting 49 matches based on the old Jacobi result.
    Assert.AreEqual(49, hits.Count);

    // The first match must be made of three sub-matches.
    Match first = (Match) hits[0];
    Assert.AreEqual(3, first.SubMatches.Count);
    Assert.AreEqual("aattt", first.SubMatches[0].Letters());
    Assert.AreEqual("tataagtg", first.SubMatches[1].Letters());
    Assert.AreEqual("ttcaa", first.SubMatches[2].Letters());

    // And finally the letters of the main match itself.
    Assert.AreEqual("aattttataagtgttcaa", first.Letters());
}
/// <summary>
/// Builds a Block pattern
/// (<see cref="QUT.Bio.BioPatML.Patterns.PWM"> PWM </see>)
/// from a list of aligned sequences. The alphabet handed to the base
/// constructor is taken from the first sequence in the list, and the
/// weights are then estimated from the whole list.
/// </summary>
/// <param name="name">Name for element block.</param>
/// <param name="sequenceList">List of aligned sequences; its first element is read.</param>
/// <param name="background">Histogram with base counts of the background
/// sequences.</param>
/// <param name="threshold">Similarity threshold.</param>
public Block(
    String name,
    SequenceList sequenceList,
    HistogramSymbol background,
    double threshold)
    : base(name, sequenceList[0].Alphabet, threshold)
{
    this.Estimate(sequenceList, background);
}
/// <summary>
/// Adds a list of two DNA sequences to the histogram and checks the
/// resulting count for each of the four symbols.
/// </summary>
public void TestAddSequenceList()
{
    Sequence first = new Sequence(AlphabetType.DNA, "actga");
    Sequence second = new Sequence(AlphabetType.DNA, "ctaca");

    SequenceList sequences = new SequenceList();
    sequences.Add(first);
    sequences.Add(second);

    histo.Add(sequences);

    Assert.AreEqual(4, histo.HistoValue(alpha['a']));
    Assert.AreEqual(3, histo.HistoValue(alpha['c']));
    Assert.AreEqual(2, histo.HistoValue(alpha['t']));
    Assert.AreEqual(1, histo.HistoValue(alpha['g']));
}
/// <summary>
/// Builds a Block from the two aligned sequences "aa" and "at" with no
/// background histogram and checks the weight of every symbol at both
/// positions.
/// </summary>
public void TestConstructor()
{
    Sequence first = new Sequence(AlphabetType.DNA, "aa", false);
    Sequence second = new Sequence(AlphabetType.DNA, "at", false);

    SequenceList aligned = new SequenceList();
    aligned.Add(first);
    aligned.Add(second);

    Block block = new Block("test", aligned, null, 0.0);

    // Position 0: both sequences carry 'a'.
    Assert.AreEqual(1.000, block.Get('a', 0), 1e-3);
    Assert.AreEqual(-0.584, block.Get('c', 0), 1e-3);
    Assert.AreEqual(-0.584, block.Get('t', 0), 1e-3);
    Assert.AreEqual(-0.584, block.Get('g', 0), 1e-3);

    // Position 1: one 'a' and one 't'.
    Assert.AreEqual(0.415, block.Get('a', 1), 1e-3);
    Assert.AreEqual(-0.584, block.Get('c', 1), 1e-3);
    Assert.AreEqual(0.415, block.Get('t', 1), 1e-3);
    Assert.AreEqual(-0.584, block.Get('g', 1), 1e-3);
}
/// <summary>
/// Searches the single-protein-sequence GenBank resource with the
/// RegionalGap pattern and checks the match count plus the letters covered
/// by the first and last matches.
/// </summary>
public void TestRegionalPattern_Gap()
{
    BiopatMLFilePath = "BioPaperTestData/RegionalPattern/RegionalGap.xml";

    // Dispose the reader once the sequences have been read.
    using (BioPatMBF_Reader gbReader = new BioPatMBF_Reader())
    {
        BioList = gbReader.Read(Global.GetResourceReader(_singleProteinSeqGenBankFilename));
    }

    MyPatterns = DefinitionIO.Read(Global.GetResourceReader(BiopatMLFilePath));

    FeatureList Matches = BioList[0].Search(SearchPosition, BioList[0].Length, MyPatterns.Pattern);

    // NOTE(review): an earlier comment cited 309 matches from the old Jacobi
    // run, but the assertion expects 410 - confirm which figure is correct.
    Assert.AreEqual(410, Matches.Count);

    // Checks the first match.
    var first = Matches[0];
    Assert.AreEqual("MT", first.MainSequence.Letters(first.Start, first.End));

    // Checks that the last match sits at the correct start and end position.
    var last = Matches[409];
    Assert.AreEqual("CE", last.MainSequence.Letters(last.Start, last.End));
}
/// <summary>
/// Parses FASTA data from the given reader with the MBF
/// <c>FastaParser</c> and converts every parsed sequence with
/// <c>ConvertToBioPatMLSeq</c>.
/// </summary>
/// <param name="reader">Text reader the FASTA parser reads from.</param>
/// <returns>List of BioPatML sequences, one per parsed sequence.</returns>
public override SequenceList Read(TextReader reader)
{
    ISequenceParser parser = new FastaParser();
    List<ISequence> parsed = parser.Parse(reader);

    SequenceList converted = new SequenceList();
    foreach (ISequence item in parsed)
    {
        // Same explicit cast to Sequence that the loop variable performed before.
        converted.Add(ConvertToBioPatMLSeq((Sequence) item));
    }

    return converted;
}
/// <summary>
/// Parses GenBank data from a text reader (which could also be a
/// StringReader) with the MBF library and converts each parsed sequence
/// with <c>ConvertToBioPatMLSeq</c>.
/// </summary>
/// <param name="reader">Text reader the GenBank parser reads from.</param>
/// <returns>A list holding one converted sequence per parsed sequence.</returns>
private SequenceList ParseSequencePath(TextReader reader)
{
    // Create the parser first; always try parsing multiple sequences in a reader.
    ISequenceParser gbParser = new GenBankParser();
    List<ISequence> mbfSequences = gbParser.Parse(reader);

    SequenceList bioSeqList = new SequenceList();
    foreach (Sequence mbfseq in mbfSequences)
    {
        // Convert each parsed sequence exactly once and keep the result.
        bioSeqList.Add(ConvertToBioPatMLSeq(mbfseq));
    }

    return bioSeqList;
}
/// <summary>
/// Estimates the weights of the PWM behind a Block pattern from a list of
/// aligned sequences.
/// </summary>
/// <exception cref="System.ArgumentException">
/// Thrown when the shortest and longest sequence lengths differ.</exception>
/// <param name="sequenceList">List of aligned sequences.</param>
/// <param name="background">Histogram with base counts of the background
/// sequences. Can be null; in that case a histogram with every symbol of
/// the pattern alphabet added once is used instead.</param>
private void Estimate(SequenceList sequenceList, HistogramSymbol background)
{
    int shortest = sequenceList.MinLength();
    int longest = sequenceList.MaxLength();
    if (longest != shortest)
    {
        throw new ArgumentException("Sequences must be of equal length!");
    }

    HistogramSymbol counts = background;
    if (counts == null)
    {
        // No background supplied: add each alphabet symbol once.
        counts = new HistogramSymbol();
        foreach (Symbol symbol in PWMalphabet)
        {
            counts.Add(symbol);
        }
    }

    base.Init(shortest);
    base.Estimate(sequenceList, 1, counts);
}
/// <summary>
/// Searches the single-protein-sequence GenBank resource with the
/// RegionalComposition pattern and checks the match count, the match list
/// name, and the position and letters of the first and last matches.
/// </summary>
public void TestRegionalPattern_Composition()
{
    BiopatMLFilePath = "BioPaperTestData/RegionalPattern/RegionalComposition.xml";

    using (BioPatMBF_Reader reader = new BioPatMBF_Reader())
    {
        BioList = reader.Read(Global.GetResourceReader(_singleProteinSeqGenBankFilename));
    }

    MyPatterns = DefinitionIO.Read(Global.GetResourceReader(BiopatMLFilePath));

    FeatureList hits = BioList[0].Search(SearchPosition, BioList[0].Length, MyPatterns.Pattern);

    // Expected total number of matches.
    Assert.AreEqual(49, hits.Count);

    // Letters covered by the first match.
    Assert.AreEqual(
        "GATLFKTRCLQCHTV",
        hits[0].MainSequence.Letters(hits[0].Start, hits[0].End));

    // The match list carries the name of the pattern that produced it.
    Assert.AreEqual("transmembrane domain", hits.Name);
    Assert.AreEqual(12, hits[0].Start);
    Assert.AreEqual(26, hits[0].End);

    // Letters covered by the last match.
    Assert.AreEqual(
        "KDRNDLITYLKKACE",
        hits[48].MainSequence.Letters(hits[48].Start, hits[48].End));
}
/// <summary>
/// Adds every sequence of the provided list to this histogram, in index
/// order, by calling <c>Add</c> on each element.
/// </summary>
/// <param name="sequenceList">List of sequences.</param>
public void Add(SequenceList sequenceList)
{
    int index = 0;
    while (index < sequenceList.Count)
    {
        Add(sequenceList[index]);
        index++;
    }
}
/// <summary>
/// Creates a histogram and immediately adds the provided list of
/// sequences to it.
/// </summary>
/// <param name="sequenceList">List of sequences.</param>
public HistogramSymbol(SequenceList sequenceList)
{
    this.Add(sequenceList);
}
/// <summary>
/// Reads the "name", "threshold", "impact" and "alphabet" attributes of the
/// node, collects the text of every child element named "Sequence", and
/// estimates the pattern weights from those sequences.
/// </summary>
/// <exception cref="System.ArgumentNullException">
/// Thrown when a Sequence element contains no letters.</exception>
/// <param name="node">The XML node describing this pattern.</param>
/// <param name="definition">The Definition element where the node sits in.</param>
public override void ReadNode(XmlNode node, Definition definition)
{
    PatternName = XMLHelper.GetAttrValueString(node, "name");
    Threshold = XMLHelper.GetAttrValDouble(node, "threshold");
    Impact = XMLHelper.GetAttrValDouble(node, "impact");
    PWMalphabet = AlphabetFactory.Instance(XMLHelper.GetAttrValueString(node, "alphabet"));

    SequenceList seqList = new SequenceList();
    for (XmlNode child = node.FirstChild; child != null; child = child.NextSibling)
    {
        if (!child.Name.Equals("Sequence"))
        {
            continue;
        }

        // Trim() never returns null, so a missing sequence shows up as an
        // empty string; test for that instead of comparing against null.
        String letters = child.InnerText.Trim();
        if (letters.Length == 0)
        {
            // Two-argument form: the first argument is the parameter name,
            // the second is the message.
            throw new ArgumentNullException("node", "Sequences in Block are missing!");
        }

        seqList.Add(new Sequence(PWMalphabet, letters, false));
    }

    // No background histogram is supplied here.
    Estimate(seqList, null);
}
/// <summary>
/// Searches the second sample GenBank resource with the Block motif
/// pattern and checks the similarity and letters of all three matches.
/// </summary>
public void TestMotifPattern_Block()
{
    BiopatMLFilePath = "BioPaperTestData/MotifPattern/Block.xml";

    // Dispose the reader once the sequences have been read.
    using (BioPatMBF_Reader gbReader = new BioPatMBF_Reader())
    {
        BioList = gbReader.Read(Global.GetResourceReader(_sampleGenBankFile2));
    }

    MyPatterns = DefinitionIO.Read(Global.GetResourceReader(BiopatMLFilePath));

    FeatureList Matches = BioList[0].Search(SearchPosition, BioList[0].Length, MyPatterns.Pattern);

    Assert.AreEqual(3, Matches.Count);
    Assert.AreEqual("Pribnow-box", Matches.Name);

    Match matched = (Match) Matches[0];
    Assert.AreEqual(0.83, matched.Similarity, 1e-2);
    Assert.AreEqual("tataac", matched.Letters());

    matched = (Match) Matches[1];
    Assert.AreEqual(0.76, matched.Similarity, 1e-2);
    Assert.AreEqual("taacat", matched.Letters());

    matched = (Match) Matches[2];
    Assert.AreEqual(0.72, matched.Similarity, 1e-2);
    Assert.AreEqual("cataaa", matched.Letters());
}
/// <summary>
/// Searches the second sample GenBank resource with the PWM motif pattern
/// and checks position, similarity and letters of the first, second and
/// last of the 14 expected matches.
/// </summary>
public void TestMotifPattern_PWM()
{
    BiopatMLFilePath = "BioPaperTestData/MotifPattern/PWM.xml";

    using (BioPatMBF_Reader reader = new BioPatMBF_Reader())
    {
        BioList = reader.Read(Global.GetResourceReader(_sampleGenBankFile2));
    }

    MyPatterns = DefinitionIO.Read(Global.GetResourceReader(BiopatMLFilePath));

    FeatureList hits = BioList[0].Search(SearchPosition, BioList[0].Length, MyPatterns.Pattern);

    Assert.AreEqual(14, hits.Count);
    Assert.AreEqual("Pribnow-box", hits.Name);

    // First match.
    Match first = (Match) hits[0];
    Assert.AreEqual(17, first.Start);
    Assert.AreEqual(22, first.End);
    Assert.AreEqual(0.67, first.Similarity, 1e-2);
    Assert.AreEqual("tctcct", first.Letters());

    // Second match.
    Match second = (Match) hits[1];
    Assert.AreEqual(25, second.Start);
    Assert.AreEqual(30, second.End);
    Assert.AreEqual(0.61, second.Similarity, 1e-2);
    Assert.AreEqual("ttggct", second.Letters());

    // Last match.
    Match last = (Match) hits[13];
    Assert.AreEqual(268, last.Start);
    Assert.AreEqual(273, last.End);
    Assert.AreEqual(0.67, last.Similarity, 1e-2);
    Assert.AreEqual("tgtgct", last.Letters());
}