/// <summary>
/// Builds the TEST dataset CSV: pre-processes each work, splits the text into
/// clusters (sentence- or word-based per <paramref name="options"/>), and writes
/// one CSV row per cluster via WriteCSV.
/// </summary>
/// <param name="works">Texts to import into the TEST set.</param>
/// <param name="headers">CSV header line written once at the top of the file.</param>
/// <param name="options">Import options (format, cluster size, stop-word removal, output path).</param>
/// <param name="writeToConsole">When true, progress messages are printed to the console.</param>
private static void writeTestDataset(List<TextInfo> works, string headers, Options options, bool writeToConsole = true)
{
    if (writeToConsole)
    {
        ConsoleHelper.Write(ConsoleColor.DarkGray, $"Klargør import af {works.Count} tekster til TEST-sæt...");
    }

    using (var writer = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8))
    {
        writer.WriteLine(headers);

        foreach (TextInfo work in works)
        {
            if (writeToConsole)
            {
                ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
            }

            List<string> list = new List<string>();
            // NOTE(review): relative path assumes a fixed working directory — TODO confirm.
            var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8, true);
            work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

            switch (options.ImportFormat)
            {
                case TextImportFormat.SENTENCES:
                    var sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                    int counter = 0;
                    string ss = "";
                    for (int i = 0; i < sentences.Count; i++)
                    {
                        string sentence = sentences[i];
                        if (counter < options.testClusterSize)
                        {
                            ss = ss + sentence + " ";
                            counter++;
                        }
                        else
                        {
                            list.Add(ss);
                            ss = sentence;
                            // Fix: reset to 1, not 0 — `ss` already holds one sentence.
                            // Resetting to 0 made every cluster after the first one
                            // sentence too long; writeDataset resets to 1 for the
                            // same pattern.
                            counter = 1;
                        }
                    }
                    // Flush the trailing partial cluster, but do not emit an empty
                    // row when no leftover text remains (previously an empty string
                    // could be added, producing a blank cluster in the CSV).
                    if (!string.IsNullOrWhiteSpace(ss))
                    {
                        list.Add(ss);
                    }
                    break;
                case TextImportFormat.WORDS:
                    list = Stylo.getWordClusters(preProcessedText, options.testClusterSize, options.removeStopWords);
                    break;
            }

            List<ImportDataModel> data = new List<ImportDataModel>();
            foreach (string value in list)
            {
                data.Add(new ImportDataModel(work, value));
            }
            WriteCSV(data, work, writer);
        }
    }

    if (writeToConsole)
    {
        Console.WriteLine("");
        ConsoleHelper.Write(ConsoleColor.DarkGray, $"CSV-fil for TEST gemt som {ModelBuilder.TestCSVFile(options)}");
        Console.WriteLine("");
    }
}
/// <summary>
/// Prints per-text and per-author summary tables for the imported TRAIN and
/// TEST sets (sentence, word, character and stop-word counts, plus words per
/// sentence with and without stop words).
/// </summary>
/// <param name="import">Import result holding TrainWork/TestWork text lists with populated TextSummary.</param>
/// <param name="options">Import options controlling which sections are printed (train/test data, split mode).</param>
public static void PrintImport(ImportResult import, Options options)
{
    if (options.importTrainData != null)
    {
        ConsoleHelper.Write(ConsoleColor.White, " Tekster i TRÆNING-sættet:");
        var table = new ConsoleTable("Forfatter", "Titel", "Udgivet", "Sætninger", "Ord", "Bogstaver", "Stopord", "Ord pr. sætning", "Ord pr. sætning u/stopord");
        table.Options.EnableCount = false;
        foreach (var item in import.TrainWork)
        {
            table.AddRow(item.Author, item.Title, item.Year,
                $"{item.TextSummary.SentencesCount}",
                $"{item.TextSummary.WordsCount}",
                $"{item.TextSummary.Textlength}",
                $"{item.TextSummary.StopWordsCount}",
                $"{(double)item.TextSummary.WordsCount / item.TextSummary.SentencesCount:0.##}",
                $"{(double)(item.TextSummary.WordsCount - item.TextSummary.StopWordsCount) / item.TextSummary.SentencesCount:0.##}");
        }
        table.Write();
        Console.WriteLine("");

        var authors = import.TrainWork.Select(o => o.Author).Distinct();
        ConsoleHelper.Write(ConsoleColor.White, "Summarisk oversigt over tekster i TRÆNINGS-sættet:");
        table = new ConsoleTable("Forfatter", "Sætninger", "Ord", "Bogstaver", "Stopord", "Ord pr. sætning", "Ord pr. sætning u/stopord");
        table.Options.EnableCount = false;
        foreach (var a in authors)
        {
            int totalsentences = import.TrainWork.Where(item => item.Author == a).Sum(item => item.TextSummary.SentencesCount);
            int totalwords = import.TrainWork.Where(item => item.Author == a).Sum(item => item.TextSummary.WordsCount);
            int totalchars = import.TrainWork.Where(item => item.Author == a).Sum(item => item.TextSummary.Textlength);
            // Fix: stop words for the TRAIN summary were summed over TestWork
            // (copy-paste bug) — must aggregate over TrainWork like the other totals.
            int totalstopwords = import.TrainWork.Where(item => item.Author == a).Sum(item => item.TextSummary.StopWordsCount);
            table.AddRow(a, totalsentences, totalwords, totalchars, totalstopwords,
                $"{(double)totalwords / totalsentences:0.##}",
                $"{(double)(totalwords - totalstopwords) / totalsentences:0.##}");
        }
        table.Write();
    }

    if (options.importTestData != null)
    {
        if (options.Split != Split.SPLIT)
        {
            ConsoleHelper.Write(ConsoleColor.White, " Tekster i TEST-sættet:");
            var table = new ConsoleTable("Forfatter", "Titel", "Udgivet", "Sætninger", "Ord", "Bogstaver", "Stopord", "Ord pr. sætning", "Ord pr. sætning u/stopord");
            table.Options.EnableCount = false;
            foreach (var item in import.TestWork)
            {
                table.AddRow(item.Author, item.Title, item.Year,
                    $"{item.TextSummary.SentencesCount}",
                    $"{item.TextSummary.WordsCount}",
                    $"{item.TextSummary.Textlength}",
                    $"{item.TextSummary.StopWordsCount}",
                    $"{(double)item.TextSummary.WordsCount / item.TextSummary.SentencesCount:0.##}",
                    $"{(double)(item.TextSummary.WordsCount - item.TextSummary.StopWordsCount) / item.TextSummary.SentencesCount:0.##}");
            }
            table.Write();
            Console.WriteLine("");

            var authors = import.TestWork.Select(o => o.Author).Distinct();
            ConsoleHelper.Write(ConsoleColor.White, "Summarisk oversigt over tekster i TEST-sættet:");
            table = new ConsoleTable("Forfatter", "Sætninger", "Ord", "Bogstaver", "Stopord", "Ord pr. sætning", "Ord pr. sætning u/stopord");
            table.Options.EnableCount = false;
            foreach (var a in authors)
            {
                int totalsentences = import.TestWork.Where(item => item.Author == a).Sum(item => item.TextSummary.SentencesCount);
                int totalwords = import.TestWork.Where(item => item.Author == a).Sum(item => item.TextSummary.WordsCount);
                int totalchars = import.TestWork.Where(item => item.Author == a).Sum(item => item.TextSummary.Textlength);
                int totalstopwords = import.TestWork.Where(item => item.Author == a).Sum(item => item.TextSummary.StopWordsCount);
                table.AddRow(a, totalsentences, totalwords, totalchars, totalstopwords,
                    $"{(double)totalwords / totalsentences:0.##}",
                    $"{(double)(totalwords - totalstopwords) / totalsentences:0.##}");
            }
            table.Write();
        }
        else
        {
            ConsoleHelper.Write(ConsoleColor.White, $"Trænings-sættet splittes og {options.splitPct} % anvendes til TEST");
        }
        Console.WriteLine("");
    }
}
/// <summary>
/// Writes both the TRAIN and TEST dataset CSV files in a single pass: each work
/// is pre-processed, then its sentences (or words) are streamed into clusters that
/// are routed alternately to the train list and the test list, interleaving
/// roughly <c>options.importSplitPct</c> percent of clusters into the test set.
/// </summary>
/// <param name="works">Texts to import and split.</param>
/// <param name="headers">CSV header line written once at the top of each file.</param>
/// <param name="options">Import options (format, cluster sizes, split percentage, stop-word removal, output paths).</param>
public static void writeDataset(List <TextInfo> works, string headers, Options options) {
    using (var trainWriter = new StreamWriter(ModelBuilder.TrainCSVFile(options), false, Encoding.UTF8)) {
        trainWriter.WriteLine(headers);
        using (var testWriter = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8)) {
            testWriter.WriteLine(headers);
            foreach (TextInfo work in works) {
                List <string> trainList = new List <string>();
                List <string> testList = new List <string>();
                ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
                // NOTE(review): relative path assumes a fixed working directory — TODO confirm.
                var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8, true);
                work.TextSummary = new Stylo.TextSummary(work, preProcessedText);
                // splitPart = number of consecutive TRAIN clusters emitted before one
                // TEST cluster. NOTE(review): the "- 2" makes the realized test share
                // larger than importSplitPct (e.g. 25% -> 2 train : 1 test = 33%), and
                // any pct >= 50 yields splitPart <= 0 so the test set never fills —
                // presumably intentional tuning, but verify against the caller.
                int splitPart = 0;
                if (options.importSplitPct > 0) {
                    splitPart = (int)Math.Round(100 / options.importSplitPct, MidpointRounding.AwayFromZero) - 2;
                }
                switch (options.ImportFormat) {
                    case TextImportFormat.SENTENCES:
                        var sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                        // State machine: accumulate sentences into `ss` until the active
                        // cluster size is reached, then flush to train or test.
                        // `stepcounter` counts train clusters since the last test cluster.
                        bool fillTrainSet = true;
                        int counter = 1;
                        int stepcounter = 0;
                        string ss = "";
                        for (int i = 0; i < sentences.Count; i++) {
                            string sentence = sentences[i];
                            if (fillTrainSet) {
                                if (counter < options.trainClusterSize) {
                                    ss = ss + sentence + " ";
                                    counter++;
                                } else {
                                    trainList.Add(ss);
                                    stepcounter++;
                                    // The flushed-out sentence starts the next cluster,
                                    // hence counter resets to 1 (not 0).
                                    ss = sentence;
                                    counter = 1;
                                    if (stepcounter == splitPart) {
                                        fillTrainSet = false;
                                    }
                                }
                            } else {
                                if (counter < options.testClusterSize) {
                                    ss = ss + sentence + " ";
                                    counter++;
                                } else {
                                    testList.Add(ss);
                                    ss = sentence;
                                    counter = 1;
                                    fillTrainSet = true;
                                    stepcounter = 0;
                                }
                            }
                        }
                        // NOTE(review): the trailing partial cluster in `ss` is discarded
                        // here, whereas writeTestDataset keeps its tail — confirm whether
                        // dropping incomplete clusters is intended.
                        break;
                    case TextImportFormat.WORDS:
                        // Same interleaving as above, but clustering whole words; the
                        // word that triggers a flush is included in the flushed cluster
                        // (ss is appended before the size check), unlike the SENTENCES
                        // case where the triggering sentence starts the next cluster.
                        var words = Stylo.GetWords(preProcessedText, options.removeStopWords);
                        fillTrainSet = true;
                        counter = 1;
                        stepcounter = 0;
                        ss = "";
                        for (int i = 0; i < words.Count; i++) {
                            ss += words[i] + " ";
                            if (fillTrainSet) {
                                if (counter < options.trainClusterSize) {
                                    counter++;
                                } else {
                                    trainList.Add(ss);
                                    ss = "";
                                    stepcounter++;
                                    counter = 1;
                                    if (stepcounter == splitPart) {
                                        fillTrainSet = false;
                                    }
                                }
                            } else {
                                if (counter < options.testClusterSize) {
                                    counter++;
                                } else {
                                    testList.Add(ss);
                                    counter = 1;
                                    fillTrainSet = true;
                                    stepcounter = 0;
                                    ss = "";
                                }
                            }
                        }
                        break;
                }
                // Materialize and write the train clusters for this work...
                List <ImportDataModel> data = new List <ImportDataModel>();
                foreach (string value in trainList) {
                    ImportDataModel cluster = new ImportDataModel(work, value);
                    data.Add(cluster);
                }
                WriteCSV(data, work, trainWriter);
                // ...then the test clusters, reusing the same buffer.
                data = new List <ImportDataModel>();
                foreach (string value in testList) {
                    ImportDataModel cluster = new ImportDataModel(work, value);
                    data.Add(cluster);
                }
                WriteCSV(data, work, testWriter);
            }
        }
    }
}