Exemple #1
0
        /// <summary>
        /// Writes the TEST data set to the CSV file returned by <c>ModelBuilder.TestCSVFile(options)</c>:
        /// the header line first, then whatever <c>WriteCSV</c> emits for each text in <paramref name="works"/>.
        /// An existing file is overwritten and the output is UTF-8 encoded.
        /// </summary>
        /// <remarks>
        /// Every text is pre-processed, summarised into <c>work.TextSummary</c> and cut into clusters
        /// according to <c>options.ImportFormat</c>:
        /// SENTENCES builds clusters of <c>options.testClusterSize</c> sentences, each sentence followed by a
        /// single space (the last cluster may be shorter); WORDS delegates to <c>Stylo.getWordClusters</c>.
        /// Any other import format yields no clusters for the text.
        /// </remarks>
        /// <param name="works">Texts to export; the <c>TextSummary</c> of each item is replaced.</param>
        /// <param name="headers">Header line written as the first line of the CSV file.</param>
        /// <param name="options">Supplies import format, cluster size, stop-word handling and the output file.</param>
        /// <param name="writeToConsole">When <c>true</c>, progress messages are written to the console.</param>
        private static void writeTestDataset(List<TextInfo> works, string headers, Options options,
                                             bool writeToConsole = true)
        {
            if (writeToConsole)
            {
                ConsoleHelper.Write(ConsoleColor.DarkGray, $"Klargør import af {works.Count} tekster til TEST-sæt...");
            }

            using (var writer = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8))
            {
                writer.WriteLine(headers);
                foreach (TextInfo work in works)
                {
                    if (writeToConsole)
                    {
                        ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
                    }

                    var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8,
                                                                true);
                    work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

                    List<string> clusters = new List<string>();
                    switch (options.ImportFormat)
                    {
                    case TextImportFormat.SENTENCES:
                        var sentences          = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                        var clusterText        = new StringBuilder();
                        int sentencesInCluster = 0;
                        for (int i = 0; i < sentences.Count; i++)
                        {
                            string sentence = sentences[i];

                            // Flush *before* appending so that every cluster holds exactly
                            // testClusterSize sentences. The "> 0" guard keeps a non-positive
                            // cluster size from producing empty clusters.
                            if (sentencesInCluster > 0 && sentencesInCluster >= options.testClusterSize)
                            {
                                clusters.Add(clusterText.ToString());
                                clusterText.Clear();
                                sentencesInCluster = 0;
                            }

                            // Every sentence gets its own trailing separator; previously the sentence
                            // carried over after a flush was glued to the one that followed it.
                            clusterText.Append(sentence).Append(' ');
                            sentencesInCluster++;
                        }

                        // Keep the trailing (possibly shorter) cluster, but never emit an empty one.
                        if (sentencesInCluster > 0)
                        {
                            clusters.Add(clusterText.ToString());
                        }
                        break;

                    case TextImportFormat.WORDS:
                        clusters = Stylo.getWordClusters(preProcessedText, options.testClusterSize, options.removeStopWords);
                        break;

                        // No default branch: other import formats leave the cluster list empty.
                    }

                    List<ImportDataModel> data = new List<ImportDataModel>();
                    foreach (string value in clusters)
                    {
                        data.Add(new ImportDataModel(work, value));
                    }

                    WriteCSV(data, work, writer);
                }
            }

            if (writeToConsole)
            {
                Console.WriteLine("");
                ConsoleHelper.Write(ConsoleColor.DarkGray,
                                    $"CSV-fil for TEST gemt som {ModelBuilder.TestCSVFile(options)}");
                Console.WriteLine("");
            }
        }
        /// <summary>
        /// Prints console tables describing the imported texts: one row per text followed by a
        /// per-author summary, first for the TRAINING set and then for the TEST set.
        /// </summary>
        /// <remarks>
        /// The TRAINING section is printed only when <c>options.importTrainData</c> is non-null, the TEST
        /// section only when <c>options.importTestData</c> is non-null. When <c>options.Split</c> equals
        /// <c>Split.SPLIT</c> the TEST tables are replaced by a single line reporting
        /// <c>options.splitPct</c>.
        /// </remarks>
        /// <param name="import">Import result providing the <c>TrainWork</c> and <c>TestWork</c> collections.</param>
        /// <param name="options">Options deciding which sections are printed.</param>
        public static void PrintImport(ImportResult import, Options options)
        {
            if (options.importTrainData != null)
            {
                ConsoleHelper.Write(ConsoleColor.White, " Tekster i TRÆNING-sættet:");
                var table = new ConsoleTable("Forfatter", "Titel", "Udgivet", "Sætninger", "Ord", "Bogstaver",
                                             "Stopord", "Ord pr. sætning", "Ord pr. sætning u/stopord");
                table.Options.EnableCount = false;

                foreach (var item in import.TrainWork)
                {
                    var summary = item.TextSummary;
                    table.AddRow(item.Author, item.Title, item.Year, $"{summary.SentencesCount}", $"{summary.WordsCount}",
                                 $"{summary.Textlength}", $"{summary.StopWordsCount}",
                                 $"{(double)summary.WordsCount / summary.SentencesCount:0.##}",
                                 $"{(double)(summary.WordsCount - summary.StopWordsCount) / summary.SentencesCount:0.##}");
                }

                table.Write();
                Console.WriteLine("");

                var authors = import.TrainWork.Select(o => o.Author).Distinct();
                ConsoleHelper.Write(ConsoleColor.White, "Summarisk oversigt over tekster i TRÆNINGS-sættet:");
                table = new ConsoleTable("Forfatter", "Sætninger", "Ord", "Bogstaver", "Stopord", "Ord pr. sætning",
                                         "Ord pr. sætning u/stopord");
                table.Options.EnableCount = false;
                foreach (var a in authors)
                {
                    // Filter once per author; all four totals must come from the TRAINING set
                    // (the stop-word total was previously summed over the TEST set by mistake).
                    var worksByAuthor  = import.TrainWork.Where(item => item.Author == a).ToList();
                    int totalsentences = worksByAuthor.Sum(item => item.TextSummary.SentencesCount);
                    int totalwords     = worksByAuthor.Sum(item => item.TextSummary.WordsCount);
                    int totalchars     = worksByAuthor.Sum(item => item.TextSummary.Textlength);
                    int totalstopwords = worksByAuthor.Sum(item => item.TextSummary.StopWordsCount);
                    table.AddRow(a, totalsentences, totalwords, totalchars, totalstopwords,
                                 $"{(double) totalwords / totalsentences:0.##}",
                                 $"{(double) (totalwords - totalstopwords) / totalsentences:0.##}");
                }

                table.Write();
            }

            if (options.importTestData != null)
            {
                if (options.Split != Split.SPLIT)
                {
                    ConsoleHelper.Write(ConsoleColor.White, " Tekster i TEST-sættet:");
                    var table = new ConsoleTable("Forfatter", "Titel", "Udgivet", "Sætninger", "Ord", "Bogstaver",
                                                 "Stopord", "Ord pr. sætning", "Ord pr. sætning u/stopord");
                    table.Options.EnableCount = false;
                    foreach (var item in import.TestWork)
                    {
                        var summary = item.TextSummary;
                        table.AddRow(item.Author, item.Title, item.Year, $"{summary.SentencesCount}", $"{summary.WordsCount}",
                                     $"{summary.Textlength}", $"{summary.StopWordsCount}",
                                     $"{(double) summary.WordsCount / summary.SentencesCount:0.##}",
                                     $"{(double) (summary.WordsCount - summary.StopWordsCount) / summary.SentencesCount:0.##}");
                    }

                    table.Write();
                    Console.WriteLine("");

                    var authors = import.TestWork.Select(o => o.Author).Distinct();
                    ConsoleHelper.Write(ConsoleColor.White, "Summarisk oversigt over tekster i TEST-sættet:");
                    table = new ConsoleTable("Forfatter", "Sætninger", "Ord", "Bogstaver", "Stopord", "Ord pr. sætning",
                                             "Ord pr. sætning u/stopord");
                    table.Options.EnableCount = false;
                    foreach (var a in authors)
                    {
                        var worksByAuthor  = import.TestWork.Where(item => item.Author == a).ToList();
                        int totalsentences = worksByAuthor.Sum(item => item.TextSummary.SentencesCount);
                        int totalwords     = worksByAuthor.Sum(item => item.TextSummary.WordsCount);
                        int totalchars     = worksByAuthor.Sum(item => item.TextSummary.Textlength);
                        int totalstopwords = worksByAuthor.Sum(item => item.TextSummary.StopWordsCount);

                        table.AddRow(a, totalsentences, totalwords, totalchars, totalstopwords,
                                     $"{(double) totalwords / totalsentences:0.##}",
                                     $"{(double) (totalwords - totalstopwords) / totalsentences:0.##}");
                    }

                    table.Write();
                }
                else
                {
                    // The test set is carved out of the training set, so there is no separate table to show.
                    ConsoleHelper.Write(ConsoleColor.White,
                                        $"Trænings-sættet splittes og {options.splitPct} % anvendes til TEST");
                }

                Console.WriteLine("");
            }
        }
Exemple #3
0
        /// <summary>
        /// Splits every text in <paramref name="works"/> into TRAINING and TEST clusters and writes them to
        /// the CSV files returned by <c>ModelBuilder.TrainCSVFile(options)</c> and
        /// <c>ModelBuilder.TestCSVFile(options)</c>. Both files are overwritten, UTF-8 encoded and start
        /// with <paramref name="headers"/>.
        /// </summary>
        /// <remarks>
        /// Each text is pre-processed, summarised into <c>work.TextSummary</c> and tokenised into sentences
        /// or words according to <c>options.ImportFormat</c> (other formats produce no clusters).
        /// Tokens are consumed in order: <c>splitPart</c> training clusters of
        /// <c>options.trainClusterSize</c> tokens are followed by one test cluster of
        /// <c>options.testClusterSize</c> tokens, after which the cycle repeats. Each token is followed by a
        /// single space. Tokens left over in an incomplete cluster at the end of a text are discarded.
        /// </remarks>
        /// <param name="works">Texts to export; the <c>TextSummary</c> of each item is replaced.</param>
        /// <param name="headers">Header line written first in both CSV files.</param>
        /// <param name="options">Supplies import format, cluster sizes, split percentage and output files.</param>
        public static void writeDataset(List<TextInfo> works, string headers, Options options)
        {
            using (var trainWriter = new StreamWriter(ModelBuilder.TrainCSVFile(options), false, Encoding.UTF8))
            {
                trainWriter.WriteLine(headers);
                using (var testWriter = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8))
                {
                    testWriter.WriteLine(headers);
                    foreach (TextInfo work in works)
                    {
                        List<string> trainList = new List<string>();
                        List<string> testList  = new List<string>();
                        ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
                        var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8,
                                                                    true);
                        work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

                        // Number of training clusters written before one test cluster is taken.
                        // NOTE(review): a result of 0 or less (importSplitPct <= 0, or large percentages
                        // where round(100 / pct) <= 2) means the test set is never filled, because
                        // stepcounter is at least 1 when it is compared - confirm this is intended.
                        int splitPart = 0;
                        if (options.importSplitPct > 0)
                        {
                            splitPart = (int)Math.Round(100 / options.importSplitPct, MidpointRounding.AwayFromZero) - 2;
                        }

                        bool fillTrainSet = true;
                        int  counter      = 1; // 1-based position of the current token within its cluster
                        int  stepcounter  = 0; // training clusters completed since the last test cluster
                        var  clusterText  = new StringBuilder();

                        // Shared by both import formats so sentences and words are clustered identically:
                        // append the token (already carrying its trailing separator), then close the
                        // cluster once it holds the configured number of tokens. Sentences used to follow
                        // a separate code path that glued the carried-over sentence to the next one and
                        // made the first cluster one sentence short.
                        Action<string> addPiece = piece =>
                        {
                            clusterText.Append(piece);
                            if (fillTrainSet)
                            {
                                if (counter < options.trainClusterSize)
                                {
                                    counter++;
                                    return;
                                }

                                trainList.Add(clusterText.ToString());
                                clusterText.Clear();
                                counter = 1;
                                stepcounter++;
                                if (stepcounter == splitPart)
                                {
                                    fillTrainSet = false;
                                }
                            }
                            else
                            {
                                if (counter < options.testClusterSize)
                                {
                                    counter++;
                                    return;
                                }

                                testList.Add(clusterText.ToString());
                                clusterText.Clear();
                                counter      = 1;
                                stepcounter  = 0;
                                fillTrainSet = true;
                            }
                        };

                        switch (options.ImportFormat)
                        {
                        case TextImportFormat.SENTENCES:
                            var sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                            for (int i = 0; i < sentences.Count; i++)
                            {
                                addPiece(sentences[i] + " ");
                            }
                            break;

                        case TextImportFormat.WORDS:
                            var words = Stylo.GetWords(preProcessedText, options.removeStopWords);
                            for (int i = 0; i < words.Count; i++)
                            {
                                addPiece(words[i] + " ");
                            }
                            break;
                        }

                        List<ImportDataModel> data = new List<ImportDataModel>();
                        foreach (string value in trainList)
                        {
                            data.Add(new ImportDataModel(work, value));
                        }
                        WriteCSV(data, work, trainWriter);

                        data = new List<ImportDataModel>();
                        foreach (string value in testList)
                        {
                            data.Add(new ImportDataModel(work, value));
                        }
                        WriteCSV(data, work, testWriter);
                    }
                }
            }
        }