/// <summary>
/// Console entry point. Asks the user to pick one of two operations and runs it:
/// (1) generate the positions file via <c>Parser.generatePositionsFile</c>, or
/// (2) read an existing positions file and generate the training and validation
/// data files via <c>DatasetFileGenerator.getExistingPositionsAndGenerateData</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.WriteLine("Choose operation by writing the corresponding choice (1 or 2):");

    string optionChosen;
    while (true)
    {
        Console.WriteLine("(1) Parse Retrovirus Database Html Dump And Generate Positions File");
        Console.WriteLine("(2) Generate Training and Validation Set From Existing Positions File");

        optionChosen = Console.ReadLine();
        if (optionChosen == null)
        {
            // ReadLine returns null once standard input is closed or exhausted
            // (e.g. redirected input). Without this check the Contains calls below
            // would throw, and re-prompting could never succeed.
            Console.WriteLine("No input available, exiting.");
            return;
        }

        // Any input containing the digit is accepted (so surrounding whitespace or
        // text is tolerated); if both digits appear, option 1 wins further down.
        if (optionChosen.Contains("1") || optionChosen.Contains("2"))
        {
            break;
        }

        Console.WriteLine("Unknown option selected, please choose (1 or 2)!");
    }

    // All file names are appended directly to workingDirectoryFilePath.
    // NOTE(review): this presumably relies on that value ending with a directory
    // separator - confirm where it is defined.
    string positionsPath = workingDirectoryFilePath + "positions.txt";

    if (optionChosen.Contains("1"))
    {
        Console.WriteLine("Make sure that you have the dump (result.html) in same path as this executable.");
        Console.WriteLine("Press ENTER to continue parsing the dump and generating the positions file");
        Console.ReadLine();

        Parser.generatePositionsFile(positionsPath, new System.Collections.Hashtable());
    }
    else
    {
        // The loop above guarantees the input contains "2" when it does not contain "1".
        string labelsPath = workingDirectoryFilePath + "labels.txt";
        string insertsPath = workingDirectoryFilePath + "inserts.txt";
        string DNAFileOutPath = workingDirectoryFilePath + "DNAString.txt";
        string DNAFileOutValidationPath = workingDirectoryFilePath + "DNAString_validation.txt";
        string validationInsertsPath = workingDirectoryFilePath + "inserts_validation.txt";
        string validationLabelsPath = workingDirectoryFilePath + "labels_validation.txt";

        Console.WriteLine("Make sure that you have the positions file (positions.txt) in same path as this executable.");
        // This prompt used to repeat the option (1) text; it now describes option (2).
        Console.WriteLine("Press ENTER to continue reading the positions file and generating the training and validation sets");
        Console.ReadLine();

        Console.WriteLine("Reading existing positions...");
        System.Collections.Hashtable existingPositionsHash = DatasetGeneratorUtil.getExistingPositionsSorted(positionsPath);

        Console.WriteLine("Done reading positions, generating data..");
        DatasetFileGenerator.getExistingPositionsAndGenerateData(
            positionsPath,
            labelsPath,
            insertsPath,
            validationInsertsPath,
            validationLabelsPath,
            DNAFileOutPath,
            DNAFileOutValidationPath,
            existingPositionsHash);
    }

    Console.WriteLine("DONE!, Press ENTER to exit");
    Console.ReadLine();
}
/// <summary>
/// Reads the positions file line by line and, for every line, appends two samples
/// to the output files: a positive sample (label "1") built from the position on
/// that line, and a negative sample (label "0") built from a position returned by
/// <c>DatasetGeneratorUtil.getRandomPostion</c>. Samples go to the training files
/// until the read progress through the positions file exceeds 90%, after which
/// they go to the validation files.
/// </summary>
/// <remarks>
/// The method runs synchronously. It was previously declared <c>async void</c>
/// without containing any <c>await</c>; exceptions now propagate directly to the
/// caller instead of being rethrown outside the caller's control.
/// All output files are opened in append mode, so existing content is kept.
/// </remarks>
/// <param name="positionsFile">Input file; each non-blank line is comma separated, with an integer position in the first field and a name in the second field (underscores are stripped before lookup).</param>
/// <param name="labelsFileOutputPath">Training labels file ("1"/"0", one per sample).</param>
/// <param name="insertsFileOutputPath">Training file receiving the one-hot encoded samples.</param>
/// <param name="validationFileOutputPath">Validation file receiving the one-hot encoded samples.</param>
/// <param name="validationLabelsFileOutputPath">Validation labels file ("1"/"0", one per sample).</param>
/// <param name="DNAStringOutputFilePath">Training file receiving the raw DNA strings.</param>
/// <param name="DNAStringOutputValidationFilePath">Validation file receiving the raw DNA strings.</param>
/// <param name="existingPositionsHash">Table of known positions passed to <c>DatasetGeneratorUtil.getRandomPostion</c>.</param>
public static void getExistingPositionsAndGenerateData(string positionsFile, string labelsFileOutputPath, string insertsFileOutputPath, string validationFileOutputPath, string validationLabelsFileOutputPath, string DNAStringOutputFilePath, string DNAStringOutputValidationFilePath, System.Collections.Hashtable existingPositionsHash)
{
    // Fraction of the positions file after which samples go to the validation set.
    const float validationThreshold = 0.90f;

    bool printToValidation = false;

    // Every writer is disposed (and therefore flushed) when the using block exits,
    // so no explicit Flush calls are needed.
    using (FileStream fs = File.Open(positionsFile, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
    using (BufferedStream bs = new BufferedStream(fs))
    using (StreamReader sr = new StreamReader(bs))
    using (StreamWriter swValidation = File.AppendText(validationFileOutputPath))
    using (StreamWriter swValidationLabels = File.AppendText(validationLabelsFileOutputPath))
    using (StreamWriter swLabels = File.AppendText(labelsFileOutputPath))
    using (StreamWriter swInserts = File.AppendText(insertsFileOutputPath))
    using (StreamWriter swDNAOutput = File.AppendText(DNAStringOutputFilePath))
    using (StreamWriter swDNAValidationOutput = File.AppendText(DNAStringOutputValidationFilePath))
    {
        float totalTargetLength = sr.BaseStream.Length;

        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (line.Trim().Length == 0)
            {
                // Blank lines (e.g. a trailing newline) carry no position.
                continue;
            }

            // These locals are declared inside the loop so each iteration's lambdas
            // capture their own copies; a task can never observe values that belong
            // to a later iteration.
            string[] fields = line.Split(',');
            int position = Int32.Parse(fields[0], System.Globalization.CultureInfo.InvariantCulture);
            KeyValuePair<string, int> randomFalsePosition = DatasetGeneratorUtil.getRandomPostion(existingPositionsHash);

            Task<DNATuple> positiveTask = Task<DNATuple>.Factory.StartNew(
                () => DatasetGeneratorUtil.getDNASequence(position, fields[1].Replace("_", "")));
            Task<DNATuple> negativeTask = Task<DNATuple>.Factory.StartNew(
                () => DatasetGeneratorUtil.getDNASequence(randomFalsePosition.Value, randomFalsePosition.Key));

            // The method returns void, so it blocks here rather than awaiting.
            // Waiting on both tasks before deciding anything ensures neither is
            // left running (or faulted and unobserved) when the loop moves on.
            Task.WaitAll(positiveTask, negativeTask);
            DNATuple positive = positiveTask.Result;
            DNATuple negative = negativeTask.Result;

            if (positive == null || negative == null)
            {
                // Unknown chromosome file specified. The whole pair is skipped so
                // positive and negative samples stay balanced.
                continue;
            }

            StreamWriter dnaWriter = printToValidation ? swDNAValidationOutput : swDNAOutput;
            StreamWriter encodedWriter = printToValidation ? swValidation : swInserts;
            StreamWriter labelWriter = printToValidation ? swValidationLabels : swLabels;

            writeSample(positive, "1", dnaWriter, encodedWriter, labelWriter);
            writeSample(negative, "0", dnaWriter, encodedWriter, labelWriter);

            // NOTE: BaseStream.Position reports how far the reader has filled its
            // buffer, not how many lines were consumed, so progress advances in
            // buffer-sized steps and the train/validation split is approximate.
            float fractionDone = ((float)sr.BaseStream.Position) / totalTargetLength;
            if (fractionDone > validationThreshold)
            {
                printToValidation = true;
            }

            // Scale to an actual percentage; the 0..1 fraction used to be printed with "%".
            Console.WriteLine("Creating files:" + (fractionDone * 100f).ToString("0.00") + "%");
        }
    }
}

// Writes one sample: the raw DNA string, its one-hot encoding, and its label.
private static void writeSample(DNATuple tup, string label, StreamWriter dnaWriter, StreamWriter encodedWriter, StreamWriter labelWriter)
{
    string dna = tup.beforePosition + tup.afterPosition;
    string encoded = DatasetGeneratorUtil.DNAStringToOneHotEncoding(dna);

    dnaWriter.WriteLine(dna);
    encodedWriter.WriteLine(encoded);
    labelWriter.WriteLine(label);
}