Beispiel #1
0
        /// <summary>
        /// Runs the complete workflow: creates the output folder structure below
        /// <c>_storageLocationPath</c>, records the used settings, clusters the scaffold set,
        /// executes steps 1 to 4 (each followed by a clustering and/or blast call), collects the
        /// result files in the <c>NOmESS_Output</c> folder, removes the intermediate folders and
        /// finally shows the <c>Done</c> dialog.
        /// </summary>
        /// <remarks>
        /// All paths are built by plain string concatenation, so <c>_storageLocationPath</c>,
        /// <c>_blastPath</c> and <c>_cdHitPath</c> have to end with a directory separator.
        /// Any I/O or process exception raised along the way propagates to the caller; in that case
        /// the intermediate folders are left on disk.
        /// </remarks>
        public void StartWorkflow()
        {
            RaiseProgress("Initializing...", 0);

            Blast blastObject = new Blast();
            allIDs allIDsObject = new allIDs();

            // Folder layout of the run (every directory string ends with a backslash).
            string path_wholeOutput = _storageLocationPath + "NOmESS_Output\\";
            string path_temporaryWorkingDirectory = path_wholeOutput + "temp\\";
            string path_step1 = path_wholeOutput + "Step 1\\";
            string path_step2 = path_wholeOutput + "Step 2\\";
            string path_step3 = path_wholeOutput + "Step 3\\";
            string path_step4 = path_wholeOutput + "Step 4\\";
            string path_finalOutput = path_wholeOutput + "Final_Output\\";
            string path_sortedDictionaryFiles_step2 = path_step2 + "SortedDictionaryFiles_Step2\\";
            string path_sortedDictionaryFiles_step3 = path_step3 + "SortedDictionaryFiles_Step3\\";
            string path_allSequenceFilesHomologName = path_finalOutput + "allSequences_filename=scaffoldID\\";

            // Create the working folders.
            // NOTE(review): path_finalOutput and its sub folder are not created here; presumably
            // createFinalIDOutput takes care of that - confirm.
            Directory.CreateDirectory(path_wholeOutput);
            Directory.CreateDirectory(path_temporaryWorkingDirectory);
            Directory.CreateDirectory(path_step1);
            Directory.CreateDirectory(path_step2);
            Directory.CreateDirectory(path_step3);
            Directory.CreateDirectory(path_step4);
            Directory.CreateDirectory(path_sortedDictionaryFiles_step2);
            Directory.CreateDirectory(path_sortedDictionaryFiles_step3);

            // Copy the external executables into the temporary working directory.
            // The cd-hit binary is stored as "cd_hit.exe" at its source and as "cd-hit.exe" in the copy.
            File.Copy(_blastPath + "blastp.exe", path_temporaryWorkingDirectory + "blastp.exe", true);
            File.Copy(_blastPath + "makeblastdb.exe", path_temporaryWorkingDirectory + "makeblastdb.exe", true);
            File.Copy(_cdHitPath + "cd_hit.exe", path_temporaryWorkingDirectory + "cd-hit.exe", true);

            // Record the settings of this run.
            WriteUsedSettings(path_wholeOutput + "used_settings.txt");

            // Scaffold set: assign internal ids (prefix ">scaffold_"), cluster with threshold _cS
            // and keep the clustered file in the output folder.
            string ids_scaffold = path_wholeOutput + "IDs_scaffoldOrganism.txt";
            Dictionary<string, string> oldDatabase = Database.GetDictionary(_scaffoldSetPath);
            Database.CreateInternalScaffoldId(oldDatabase, ">scaffold_", path_temporaryWorkingDirectory + "scaffoldSet.fasta", ids_scaffold);

            CdHitCall(path_temporaryWorkingDirectory, "scaffoldSet.fasta", "scaffoldSet-Cl.fasta", _cS);
            File.Copy(path_temporaryWorkingDirectory + "scaffoldSet-Cl.fasta", path_wholeOutput + "scaffoldSet-Cl.fasta", true);
            // Point at the copy made on the line above. The previous value
            // (_storageLocationPath + "scaffoldSet-Cl.fasta") named a file this method never creates.
            string scaffoldSet = path_wholeOutput + "scaffoldSet-Cl.fasta";
            File.Delete(path_temporaryWorkingDirectory + "scaffoldSet.fasta");

            // Step 1: optional preprocessing of the input set; without preprocessing the input
            // sequences only receive internal ids with the step 1 prefix.
            string database_step1 = path_step1 + "STEP1.fasta";
            string ids_step1 = path_step1 + "IDs_Step1.txt";

            if (_preprocessing)
            {
                RaiseProgress("Step 1: Preprocessing...", 5);
                Database.Preprocessing(_inputSetPath, database_step1, ids_step1, _noAaMotiv, _minimalSeqLength, _prefixNewId1);
            }
            else
            {
                Dictionary<string, string> oldInputDatabase = Database.GetDictionary(_inputSetPath);
                Database.CreateInternalScaffoldId(oldInputDatabase, _prefixNewId1, database_step1, ids_step1);
            }

            // Cluster the step 1 database with threshold _c1 and store the result in the step 1 folder.
            File.Copy(database_step1, path_temporaryWorkingDirectory + "STEP1.fasta", true);
            CdHitCall(path_temporaryWorkingDirectory, "STEP1.fasta", "STEP1-Cl.fasta", _c1);

            File.Copy(path_temporaryWorkingDirectory + "STEP1-Cl.fasta", path_step1 + "STEP1-Cl.fasta", true);

            RaiseProgress("Step 2: Fragment grouping...", 15);

            // First blast: build the scaffold blast database, blast the clustered step 1 sequences
            // and move the blast output into the step 2 folder.
            BuildScaffoldBlastDatabase(path_temporaryWorkingDirectory);

            BlastCall(path_temporaryWorkingDirectory, "STEP1-Cl.fasta", "blastForStep2.txt");

            File.Copy(path_temporaryWorkingDirectory + "blastForStep2.txt", path_step2 + "blastForStep2.txt", true);
            File.Delete(path_temporaryWorkingDirectory + "STEP1.fasta");
            File.Delete(path_temporaryWorkingDirectory + "blastForStep2.txt");

            // Step 2: sequence assembly.
            string blastOutput_step2 = path_step2 + "blastForStep2.txt";
            string database_step2 = path_step2 + "STEP2.fasta";
            string ids_step2 = path_step2 + "IDs_Step2.txt";
            string sortedList_step2 = path_step2 + "sortedLists_Step2.txt";
            string restSequences_step2 = path_step2 + "restSequencePieces_Step2.fasta";
            string blastDictionary_step2 = path_step2 + "BlastDictionary_Step2.txt";
            string notTakenSequences_step2 = path_step2 + "SNT2.fasta";
            string seqsWithoutScaffoldPath = path_step2 + "SWS.fasta";
            bool printSWS = true;

            RaiseProgress("Step 2: Assembly...", 15);
            blastObject.CreateNonRedundantDatabase_step2(blastOutput_step2, scaffoldSet, path_step1 + "STEP1-Cl.fasta", database_step2, ids_step2, sortedList_step2, restSequences_step2, blastDictionary_step2, path_sortedDictionaryFiles_step2, notTakenSequences_step2, _percentageIdentity2, _eValue2, _bitScore2, _queryLength2, _alignmentLength2, _prefixNewId2, seqsWithoutScaffoldPath, printSWS);

            // Cluster the step 2 database with threshold _c2 and store the result in the step 2 folder.
            File.Copy(database_step2, path_temporaryWorkingDirectory + "STEP2.fasta", true);
            CdHitCall(path_temporaryWorkingDirectory, "STEP2.fasta", "STEP2-Cl.fasta", _c2);

            File.Copy(path_temporaryWorkingDirectory + "STEP2-Cl.fasta", path_step2 + "STEP2-Cl.fasta", true);
            File.Delete(path_temporaryWorkingDirectory + "STEP2.fasta");

            RaiseProgress("Step 3: Fragment grouping...", 55);

            // Second blast: blast the clustered step 2 sequences and move the output into the step 3 folder.
            BlastCall(path_temporaryWorkingDirectory, "STEP2-Cl.fasta", "blastForStep3.txt");

            File.Copy(path_temporaryWorkingDirectory + "blastForStep3.txt", path_step3 + "blastForStep3.txt", true);
            File.Delete(path_temporaryWorkingDirectory + "STEP2-Cl.fasta");
            File.Delete(path_temporaryWorkingDirectory + "blastForStep3.txt");

            // Step 3: concatenation of non-overlapping sequence pieces.
            string blastOutput_step3 = path_step3 + "blastForStep3.txt";
            string database_step3 = path_step3 + "STEP3.fasta";
            string ids_step3 = path_step3 + "IDs_Step3.txt";
            string sortedList_step3 = path_step3 + "sortedLists_Step3.txt";
            string blastDictionary_step3 = path_step3 + "BlastDictionary_Step3.txt";
            string notTakenSequences_step3 = path_step3 + "SNT3.fasta";

            RaiseProgress("Step 3: Concatenation...", 55);
            blastObject.CreateNonRedundantDatabase_step3(blastOutput_step3, path_step2 + "STEP2-Cl.fasta", database_step3, ids_step3, sortedList_step3, path_sortedDictionaryFiles_step3, blastDictionary_step3, notTakenSequences_step3, _percentageIdentity3, _eValue3, _bitScore3, _queryLength3, _alignmentLength3, _prefixNewId3);

            // Cluster the step 3 database with threshold _c3 and store the result in the step 3 folder.
            File.Copy(database_step3, path_temporaryWorkingDirectory + "STEP3.fasta", true);
            CdHitCall(path_temporaryWorkingDirectory, "STEP3.fasta", "STEP3-Cl.fasta", _c3);

            File.Copy(path_temporaryWorkingDirectory + "STEP3-Cl.fasta", path_step3 + "STEP3-Cl.fasta", true);
            File.Delete(path_temporaryWorkingDirectory + "STEP3.fasta");

            RaiseProgress("Step 4: Fragment grouping...", 80);

            // Third blast: blast the clustered step 3 sequences and copy the output into the step 4 folder.
            // The temporary copies are not deleted here; the whole temp folder is removed at the end.
            BlastCall(path_temporaryWorkingDirectory, "STEP3-Cl.fasta", "blastForStep4.txt");

            File.Copy(path_temporaryWorkingDirectory + "blastForStep4.txt", path_step4 + "blastForStep4.txt", true);

            // Step 4: take the longest representative for each homologous protein.
            string blastOutput_step4 = path_step4 + "blastForStep4.txt";
            string ids_step4 = path_step4 + "IDs_Step4.txt";
            string sortedList_step4 = path_step4 + "sortedLists_Step4.txt";
            string longestSequences_step4 = path_step4 + "STEP4.fasta";
            string shorterSequences_step4 = path_step4 + "shorterSequences_Step4.fasta";
            string blastDictionary_step4 = path_step4 + "BlastDictionary_Step4.txt";
            string notTakenSequences_step4 = path_step4 + "SNT4.fasta";

            RaiseProgress("Step 4: Representative selection...", 80);
            blastObject.CreateNonRedundantDatabase_step4(blastOutput_step4, path_step3 + "STEP3-Cl.fasta", ids_step4, sortedList_step4, longestSequences_step4, shorterSequences_step4, blastDictionary_step4, notTakenSequences_step4, _percentageIdentity4, _eValue4, _bitScore4, _queryLength4, _alignmentLength4, _prefixNewId4);

            // Build the combined id output from the id files and sequence databases of all steps.
            Dictionary<string, List<string>> ids_0 = allIDsObject.GetDict(ids_step1, 0, 1);
            Dictionary<string, List<string>> ids_1 = allIDsObject.GetDict(ids_step2, 0, 2);
            Dictionary<string, List<string>> ids_2 = allIDsObject.GetDict(ids_step3, 0, 2);
            Dictionary<string, List<string>> ids_3 = allIDsObject.GetDict(ids_step4, 0, 1);
            Dictionary<string, List<string>> ids_homo = allIDsObject.GetDict(ids_scaffold, 0, 1);

            Dictionary<string, string> databaseOriginal = Database.GetDictionary(_inputSetPath);
            Dictionary<string, string> databaseAfter0 = Database.GetDictionary(path_step1 + "STEP1-Cl.fasta");
            Dictionary<string, string> databaseAfter1 = Database.GetDictionary(path_step2 + "STEP2-Cl.fasta");
            Dictionary<string, string> databaseAfter2 = Database.GetDictionary(path_step3 + "STEP3-Cl.fasta");
            Dictionary<string, string> databaseAfter3 = Database.GetDictionary(path_step4 + "STEP4.fasta");
            Dictionary<string, string> scaffoldDatabase = Database.GetDictionary(path_wholeOutput + "scaffoldSet-Cl.fasta");

            string idOutputPath = path_wholeOutput + "IDs.txt";
            allIDsObject.createFinalIDOutput(ids_0, ids_1, ids_2, ids_3, ids_homo, databaseOriginal, databaseAfter0, databaseAfter1, databaseAfter2, databaseAfter3, scaffoldDatabase, idOutputPath, path_allSequenceFilesHomologName, 1);

            // Copy the result files of interest into the output folder.
            File.Copy(path_step1 + "STEP1-Cl.fasta", path_wholeOutput + "STEP1-Cl.fasta", true);
            File.Copy(path_step2 + "STEP2.fasta", path_wholeOutput + "STEP2.fasta", true);
            File.Copy(path_step2 + "STEP2-Cl.fasta", path_wholeOutput + "STEP2-Cl.fasta", true);
            File.Copy(path_step3 + "STEP3.fasta", path_wholeOutput + "STEP3.fasta", true);
            File.Copy(path_step3 + "STEP3-Cl.fasta", path_wholeOutput + "STEP3-Cl.fasta", true);
            File.Copy(path_step4 + "STEP4.fasta", path_wholeOutput + "STEP4.fasta", true);
            File.Copy(path_step2 + "SNT2.fasta", path_wholeOutput + "SNT2.fasta", true);
            File.Copy(path_step3 + "SNT3.fasta", path_wholeOutput + "SNT3.fasta", true);
            File.Copy(path_step4 + "SNT4.fasta", path_wholeOutput + "SNT4.fasta", true);
            File.Copy(path_step2 + "SWS.fasta", path_wholeOutput + "SWS.fasta", true);

            // Remove the intermediate folders and files that are no longer needed.
            Directory.Delete(path_step1, true);
            Directory.Delete(path_step2, true);
            Directory.Delete(path_step3, true);
            Directory.Delete(path_step4, true);
            Directory.Delete(path_wholeOutput + "temp", true);
            File.Delete(path_wholeOutput + "scaffoldSet-Cl.fasta");
            File.Delete(path_wholeOutput + "IDs.txt");
            File.Delete(path_wholeOutput + "IDs_scaffoldOrganism.txt");

            // Report completion and show the "Done!" window.
            RaiseProgress("Done!", 100);
            Done done = new Done();
            done.ShowDialog();
        }

        /// <summary>
        /// Writes the required and optional parameters of the current run as tab-separated
        /// "label / value" lines into the given text file (an existing file is overwritten).
        /// </summary>
        /// <param name="settingsFilePath">Full path of the settings file to create.</param>
        private void WriteUsedSettings(string settingsFilePath)
        {
            // The using statement closes the file even when a write fails.
            using (StreamWriter writer = new StreamWriter(settingsFilePath))
            {
                writer.WriteLine("Required parameters");
                writer.WriteLine();
                writer.WriteLine("Storage location\t" + _storageLocationPath);
                writer.WriteLine("Blastp\t" + _blastPath);
                writer.WriteLine("Cd-hit\t" + _cdHitPath);
                writer.WriteLine("Input set\t" + _inputSetPath);
                writer.WriteLine("Scaffold set\t" + _scaffoldSetPath);

                // Five blank lines separate the required from the optional parameters.
                for (int blankLine = 0; blankLine < 5; blankLine++)
                {
                    writer.WriteLine();
                }

                writer.WriteLine("Optional parameters");
                writer.WriteLine();
                writer.WriteLine("ID prefixes");
                writer.WriteLine("Prefix step 1\t" + _prefixNewId1);
                writer.WriteLine("Prefix step 2\t" + _prefixNewId2);
                writer.WriteLine("Prefix step 3\t" + _prefixNewId3);
                writer.WriteLine("Prefix step 4\t" + _prefixNewId4);
                writer.WriteLine();

                writer.WriteLine("Preprocessing");
                writer.WriteLine("Preprocessing ( step 1 )\t" + _preprocessing);
                writer.WriteLine("Non amino acid characters\t" + _noAaMotiv);
                writer.WriteLine("Minimal sequence length\t" + _minimalSeqLength);
                writer.WriteLine();

                writer.WriteLine("Identity thresholds (cd-hit)");
                writer.WriteLine("Scaffold set\t" + _cS);
                writer.WriteLine("Step 1\t" + _c1);
                writer.WriteLine("Step 2\t" + _c2);
                writer.WriteLine("Step 3\t" + _c3);
                writer.WriteLine();

                writer.WriteLine("Homology thresholds (blastp)");
                writer.WriteLine("Step 2: percentage identity\t" + _percentageIdentity2);
                writer.WriteLine("Step 2: e-value\t" + _eValue2);
                writer.WriteLine("Step 2: bit score\t" + _bitScore2);
                writer.WriteLine("Step 2: query length\t" + _queryLength2);
                writer.WriteLine("Step 2: alignment length\t" + _alignmentLength2);
                writer.WriteLine("Step 3: percentage identity\t" + _percentageIdentity3);
                writer.WriteLine("Step 3: e-value\t" + _eValue3);
                writer.WriteLine("Step 3: bit score\t" + _bitScore3);
                writer.WriteLine("Step 3: query length\t" + _queryLength3);
                writer.WriteLine("Step 3: alignment length\t" + _alignmentLength3);
                writer.WriteLine("Step 4: percentage identity\t" + _percentageIdentity4);
                writer.WriteLine("Step 4: e-value\t" + _eValue4);
                writer.WriteLine("Step 4: bit score\t" + _bitScore4);
                writer.WriteLine("Step 4: query length\t" + _queryLength4);
                writer.WriteLine("Step 4: alignment length\t" + _alignmentLength4);
            }
        }

        /// <summary>
        /// Starts <c>makeblastdb.exe</c> (taken from <c>_blastPath</c>) on <c>scaffoldSet-Cl.fasta</c>
        /// inside the given working directory, producing the <c>scaffoldDatabase</c> output, and
        /// blocks until the process has exited.
        /// </summary>
        /// <param name="workingDirectory">Directory that contains <c>scaffoldSet-Cl.fasta</c>.</param>
        private void BuildScaffoldBlastDatabase(string workingDirectory)
        {
            // Disposing the process releases its handle even if Start or WaitForExit throws.
            using (Process makeDbProcess = new Process())
            {
                makeDbProcess.StartInfo.WorkingDirectory = workingDirectory;
                makeDbProcess.StartInfo.FileName = _blastPath + "makeblastdb.exe";
                makeDbProcess.StartInfo.Arguments = "-in scaffoldSet-Cl.fasta -out scaffoldDatabase";
                makeDbProcess.StartInfo.UseShellExecute = true;
                makeDbProcess.StartInfo.RedirectStandardError = false;
                makeDbProcess.Start();
                makeDbProcess.WaitForExit();
            }
        }