Exemplo n.º 1
0
        /// <summary>
        /// Writes the TEST CSV file: the header line, followed by the rows produced for every work.
        /// Each work's text is pre-processed, split into clusters of sentences or words
        /// (depending on <c>options.ImportFormat</c>) and every cluster becomes one
        /// <c>ImportDataModel</c> that is handed to <c>WriteCSV</c>.
        /// </summary>
        /// <param name="works">Works to export. Each work's <c>TextSummary</c> is overwritten as a side effect.</param>
        /// <param name="headers">Header line written as the first line of the CSV file.</param>
        /// <param name="options">Supplies the import format, the test cluster size, the stop-word flag and the output path.</param>
        /// <param name="writeToConsole">When <c>true</c>, progress messages are written to the console.</param>
        private static void writeTestDataset(List <TextInfo> works, string headers, Options options,
                                             bool writeToConsole = true)
        {
            // Resolved once; used both for opening the file and for the final status message.
            var testCsvPath = ModelBuilder.TestCSVFile(options);

            if (writeToConsole)
            {
                ConsoleHelper.Write(ConsoleColor.DarkGray, $"Klargør import af {works.Count} tekster til TEST-sæt...");
            }

            using (var writer = new StreamWriter(testCsvPath, false, Encoding.UTF8))
            {
                writer.WriteLine(headers);
                foreach (TextInfo work in works)
                {
                    if (writeToConsole)
                    {
                        ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
                    }

                    List <string> list             = new List <string>();
                    var           preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8,
                                                                          true);
                    work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

                    // NOTE(review): any format other than SENTENCES/WORDS leaves `list` empty,
                    // so WriteCSV is called with no rows for the work.
                    switch (options.ImportFormat)
                    {
                    case TextImportFormat.SENTENCES:
                        var    sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                        int    counter   = 0;
                        string ss        = "";
                        for (int i = 0; i < sentences.Count; i++)
                        {
                            string sentence = sentences[i];

                            // Append first, then flush once the cluster is full. Every sentence is
                            // followed by a separating space and every full cluster holds exactly
                            // testClusterSize sentences (a size below 1 yields one sentence per cluster).
                            ss = ss + sentence + " ";
                            counter++;
                            if (counter >= options.testClusterSize)
                            {
                                list.Add(ss);
                                ss      = "";
                                counter = 0;
                            }
                        }

                        // Keep the trailing partial cluster, but never emit an empty one.
                        if (ss.Length > 0)
                        {
                            list.Add(ss);
                        }
                        break;

                    case TextImportFormat.WORDS:
                        list = Stylo.getWordClusters(preProcessedText, options.testClusterSize, options.removeStopWords);
                        break;
                    }

                    // One data row per cluster.
                    List <ImportDataModel> data = new List <ImportDataModel>();
                    foreach (string value in list)
                    {
                        data.Add(new ImportDataModel(work, value));
                    }

                    WriteCSV(data, work, writer);
                }
            }

            if (writeToConsole)
            {
                Console.WriteLine("");
                ConsoleHelper.Write(ConsoleColor.DarkGray,
                                    $"CSV-fil for TEST gemt som {testCsvPath}");
                Console.WriteLine("");
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Writes the TRAIN and TEST CSV files in one pass. For every work the pre-processed text is
        /// cut into clusters of sentences or words (depending on <c>options.ImportFormat</c>);
        /// clusters are routed alternately to the train set and the test set, and each cluster
        /// becomes one <c>ImportDataModel</c> row written through <c>WriteCSV</c>.
        /// </summary>
        /// <param name="works">Works to export. Each work's <c>TextSummary</c> is overwritten as a side effect.</param>
        /// <param name="headers">Header line written as the first line of both CSV files.</param>
        /// <param name="options">Supplies the import format, the train/test cluster sizes, the split percentage, the stop-word flag and the output paths.</param>
        public static void writeDataset(List <TextInfo> works, string headers, Options options)
        {
            // Number of train clusters emitted before a single test cluster is emitted.
            // It depends only on the options, so it is computed once.
            // NOTE(review): when this evaluates to 0 or less, the "stepcounter == splitPart"
            // check below never fires and no test clusters are produced - confirm this is intended.
            int splitPart = 0;
            if (options.importSplitPct > 0)
            {
                splitPart = (int)Math.Round(100 / options.importSplitPct, MidpointRounding.AwayFromZero) - 2;
            }

            using (var trainWriter = new StreamWriter(ModelBuilder.TrainCSVFile(options), false, Encoding.UTF8))
            {
                trainWriter.WriteLine(headers);
                using (var testWriter = new StreamWriter(ModelBuilder.TestCSVFile(options), false, Encoding.UTF8))
                {
                    testWriter.WriteLine(headers);
                    foreach (TextInfo work in works)
                    {
                        List <string> trainList = new List <string>();
                        List <string> testList  = new List <string>();
                        ConsoleHelper.Write(ConsoleColor.DarkGray, $"---> {work.Title}");
                        var preProcessedText = Stylo.PreProcessText(Path.Combine(@"..\..\Tekster\", work.Filename), Encoding.UTF8,
                                                                    true);
                        work.TextSummary = new Stylo.TextSummary(work, preProcessedText);

                        // Clustering state shared by both formats:
                        //   ss           - text of the cluster currently being filled
                        //   counter      - 1-based position of the next token inside that cluster
                        //   stepcounter  - train clusters emitted since the last test cluster
                        //   fillTrainSet - whether the current cluster goes to the train set
                        bool   fillTrainSet = true;
                        int    counter      = 1;
                        int    stepcounter  = 0;
                        string ss           = "";

                        // In both formats a token is appended first and the cluster is flushed once
                        // it holds the configured number of tokens. A trailing partial cluster is
                        // not written to either set.
                        switch (options.ImportFormat)
                        {
                        case TextImportFormat.SENTENCES:
                            var sentences = Stylo.GetSentences(preProcessedText, options.removeStopWords);
                            for (int i = 0; i < sentences.Count; i++)
                            {
                                string sentence = sentences[i];
                                ss += sentence + " ";
                                if (fillTrainSet)
                                {
                                    if (counter < options.trainClusterSize)
                                    {
                                        counter++;
                                    }
                                    else
                                    {
                                        trainList.Add(ss);
                                        ss = "";
                                        stepcounter++;
                                        counter = 1;
                                        if (stepcounter == splitPart)
                                        {
                                            // Enough train clusters in a row: the next cluster is a test cluster.
                                            fillTrainSet = false;
                                        }
                                    }
                                }
                                else
                                {
                                    if (counter < options.testClusterSize)
                                    {
                                        counter++;
                                    }
                                    else
                                    {
                                        testList.Add(ss);
                                        ss           = "";
                                        counter      = 1;
                                        fillTrainSet = true;
                                        stepcounter  = 0;
                                    }
                                }
                            }
                            break;

                        case TextImportFormat.WORDS:
                            var words = Stylo.GetWords(preProcessedText, options.removeStopWords);
                            for (int i = 0; i < words.Count; i++)
                            {
                                ss += words[i] + " ";
                                if (fillTrainSet)
                                {
                                    if (counter < options.trainClusterSize)
                                    {
                                        counter++;
                                    }
                                    else
                                    {
                                        trainList.Add(ss);
                                        ss = "";
                                        stepcounter++;
                                        counter = 1;
                                        if (stepcounter == splitPart)
                                        {
                                            // Enough train clusters in a row: the next cluster is a test cluster.
                                            fillTrainSet = false;
                                        }
                                    }
                                }
                                else
                                {
                                    if (counter < options.testClusterSize)
                                    {
                                        counter++;
                                    }
                                    else
                                    {
                                        testList.Add(ss);
                                        ss           = "";
                                        counter      = 1;
                                        fillTrainSet = true;
                                        stepcounter  = 0;
                                    }
                                }
                            }
                            break;
                        }

                        // One data row per train cluster.
                        List <ImportDataModel> data = new List <ImportDataModel>();
                        foreach (string value in trainList)
                        {
                            data.Add(new ImportDataModel(work, value));
                        }
                        WriteCSV(data, work, trainWriter);

                        // One data row per test cluster.
                        data = new List <ImportDataModel>();
                        foreach (string value in testList)
                        {
                            data.Add(new ImportDataModel(work, value));
                        }
                        WriteCSV(data, work, testWriter);
                    }
                }
            }
        }