/// <summary>
/// Writes only the TEST dataset: pre-processes each work, splits it into clusters
/// of options.testClusterSize, and appends one CSV row per cluster to the test file.
/// </summary>
private static void writeTestDataset(List<TextInfo> works, string headers, Options options, bool writeToConsole = true)
{
    if (writeToConsole)
    {
        ConsoleHelper.Write(ConsoleColor.DarkGray, $"Preparing import of {works.Count} texts for the TEST set...");
    }

    using (var writer = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8))
    {
        writer.WriteLine(headers);

        foreach (TextInfo work in works)
        {
            if (writeToConsole)
            {
                ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
            }

            List<string> list = new List<string>();
            var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8, true);
            work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

            switch (options.ImportFormat)
            {
                case TextImportFormat.SENTENCES:
                    // Group consecutive sentences into clusters of testClusterSize.
                    var sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                    int counter = 0;
                    string ss = "";
                    for (int i = 0; i < sentences.Count; i++)
                    {
                        string sentence = sentences[i];
                        if (counter < options.testClusterSize)
                        {
                            ss = ss + sentence + " ";
                            counter++;
                        }
                        else
                        {
                            list.Add(ss);
                            ss = sentence;
                            counter = 1; // the new cluster already holds one sentence (a reset to 0 made every cluster after the first one sentence too long)
                        }
                    }
                    if (!string.IsNullOrWhiteSpace(ss))
                    {
                        list.Add(ss); // keep the trailing partial cluster
                    }
                    break;

                case TextImportFormat.WORDS:
                    list = Stylo.getWordClusters(preProcessedText, options.testClusterSize, options.removeStopWords);
                    break;

                //case TextImportFormat.CHARS:
                //    list = Stylo.getCharacterClusters(preProcessedText, options.testClusterSize);
                //    break;
                //default:
                //    list = Stylo.getCharacterClusters(preProcessedText, options.testClusterSize);
                //    break;
            }

            // One ImportDataModel row per cluster.
            List<ImportDataModel> data = new List<ImportDataModel>();
            foreach (string value in list)
            {
                ImportDataModel cluster = new ImportDataModel(work, value);
                //cluster.DifferentWords = cluster.DifferentWords / options.testClusterSize;
                //cluster.LongWords = cluster.LongWords / options.testClusterSize;
                //cluster.HapaxLegomena = cluster.HapaxLegomena / options.testClusterSize;
                //cluster.StopWords = cluster.StopWords / options.testClusterSize;
                //cluster.TotalWords = cluster.TotalWords / options.testClusterSize;
                data.Add(cluster);
            }
            WriteCSV(data, work, writer);
        }
    }

    if (writeToConsole)
    {
        Console.WriteLine("");
        ConsoleHelper.Write(ConsoleColor.DarkGray, $"TEST CSV file saved as {ModelBuilder.TestCSVFile(options)}");
        Console.WriteLine("");
    }
}
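// Illustrative sketch, not part of the original code: the sentence-clustering loops in
// writeTestDataset and writeDataset both reduce to fixed-size chunking, which could be
// factored into a helper like this one. The name ChunkSentences is hypothetical.
private static List<string> ChunkSentences(List<string> sentences, int clusterSize)
{
    var clusters = new List<string>();
    var current = new StringBuilder();
    int count = 0;

    foreach (string sentence in sentences)
    {
        current.Append(sentence).Append(' ');
        count++;
        if (count == clusterSize)
        {
            clusters.Add(current.ToString());
            current.Clear();
            count = 0;
        }
    }

    // Keep any trailing partial cluster instead of silently dropping it.
    if (current.Length > 0)
    {
        clusters.Add(current.ToString());
    }

    return clusters;
}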
/// <summary>
/// Writes the TRAIN and TEST datasets in one pass: each work is split into clusters,
/// and after every splitPart TRAIN clusters one TEST cluster is emitted, so the split
/// ratio follows options.importSplitPct.
/// </summary>
public static void writeDataset(List<TextInfo> works, string headers, Options options)
{
    using (var trainWriter = new StreamWriter(ModelBuilder.TrainCSVFile(options), false, Encoding.UTF8))
    using (var testWriter = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8))
    {
        trainWriter.WriteLine(headers);
        testWriter.WriteLine(headers);

        foreach (TextInfo work in works)
        {
            List<string> trainList = new List<string>();
            List<string> testList = new List<string>();

            ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
            var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8, true);
            work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

            // Number of TRAIN clusters to fill before switching to a single TEST cluster.
            int splitPart = 0;
            if (options.importSplitPct > 0)
            {
                splitPart = (int)Math.Round(100 / options.importSplitPct, MidpointRounding.AwayFromZero) - 2;
            }

            switch (options.ImportFormat)
            {
                case TextImportFormat.SENTENCES:
                {
                    var sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                    bool fillTrainSet = true;
                    int counter = 1;
                    int stepcounter = 0;
                    string ss = "";
                    for (int i = 0; i < sentences.Count; i++)
                    {
                        string sentence = sentences[i];
                        if (fillTrainSet)
                        {
                            if (counter < options.trainClusterSize)
                            {
                                ss = ss + sentence + " ";
                                counter++;
                            }
                            else
                            {
                                trainList.Add(ss);
                                stepcounter++;
                                ss = sentence;
                                counter = 1;
                                if (stepcounter == splitPart)
                                {
                                    fillTrainSet = false;
                                }
                            }
                        }
                        else
                        {
                            if (counter < options.testClusterSize)
                            {
                                ss = ss + sentence + " ";
                                counter++;
                            }
                            else
                            {
                                testList.Add(ss);
                                ss = sentence;
                                counter = 1;
                                fillTrainSet = true;
                                stepcounter = 0;
                            }
                        }
                    }
                    // Note: any trailing partial cluster left in ss is discarded here.
                    break;
                }

                case TextImportFormat.WORDS:
                {
                    var words = Stylo.GetWords(preProcessedText, options.removeStopWords);
                    bool fillTrainSet = true;
                    int counter = 1;
                    int stepcounter = 0;
                    string ss = "";
                    for (int i = 0; i < words.Count; i++)
                    {
                        ss += words[i] + " ";
                        if (fillTrainSet)
                        {
                            if (counter < options.trainClusterSize)
                            {
                                counter++;
                            }
                            else
                            {
                                trainList.Add(ss);
                                ss = "";
                                stepcounter++;
                                counter = 1;
                                if (stepcounter == splitPart)
                                {
                                    fillTrainSet = false;
                                }
                            }
                        }
                        else
                        {
                            if (counter < options.testClusterSize)
                            {
                                counter++;
                            }
                            else
                            {
                                testList.Add(ss);
                                counter = 1;
                                fillTrainSet = true;
                                stepcounter = 0;
                                ss = "";
                            }
                        }
                    }
                    // Note: any trailing partial cluster left in ss is discarded here.
                    break;
                }
            }

            // TRAIN rows.
            List<ImportDataModel> data = new List<ImportDataModel>();
            foreach (string value in trainList)
            {
                data.Add(new ImportDataModel(work, value));
            }
            WriteCSV(data, work, trainWriter);

            // TEST rows.
            data = new List<ImportDataModel>();
            foreach (string value in testList)
            {
                data.Add(new ImportDataModel(work, value));
            }
            WriteCSV(data, work, testWriter);
        }
    }
}
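// Usage sketch (illustrative only, assuming a console entry point that has already built the
// works list and a headers string; the option values below are placeholders, not the project's
// actual defaults, and object-initializer access to Options is an assumption):
//
//   var options = new Options
//   {
//       ImportFormat = TextImportFormat.WORDS,
//       trainClusterSize = 500,
//       testClusterSize = 500,
//       importSplitPct = 20,
//       removeStopWords = true
//   };
//   writeDataset(works, headers, options);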