Ejemplo n.º 1
0
        /// <summary>
        /// Returns the document data for the files in <paramref name="selectedDirectory"/>.
        /// When a previously written "document_data_list.json" exists in that directory the user
        /// is offered the choice of loading it instead of processing the files again. After a
        /// fresh processing run the result is written back to that JSON file (asking for
        /// confirmation first when the file already exists).
        /// </summary>
        /// <param name="selectedDirectory">Directory that holds the input files and the JSON cache file.</param>
        /// <returns>The cached or freshly processed list of document data.</returns>
        private static List<DocumentData> HandleDocumentDataListProcessing(string selectedDirectory)
        {
            const string processedDocumentDataListJsonFileName = "document_data_list.json";

            var processedDocumentDataListJsonPath       = Path.Combine(selectedDirectory, processedDocumentDataListJsonFileName);
            var processedDocumentDataListJsonFileExists = File.Exists(processedDocumentDataListJsonPath);

            if (processedDocumentDataListJsonFileExists)
            {
                ConsoleWriteLineWithColor($"Found processed documents file at '{processedDocumentDataListJsonPath}'");
                ConsoleWriteLineWithColor("Would you like to use that in order to skip the processing step?");
                ConsoleWriteLineWithColor("1. Yes");
                ConsoleWriteLineWithColor("2. No");

                var useProcessedFileOption = UserInputHandler.GetNumberInputFromUser(new List<int> { 1, 2 });
                if (useProcessedFileOption == 1)
                {
                    var processedDocumentDataListJson = File.ReadAllText(processedDocumentDataListJsonPath);
                    var processedDocumentDataList     = JsonConvert.DeserializeObject<List<DocumentData>>(processedDocumentDataListJson);

                    if (processedDocumentDataList != null)
                    {
                        return processedDocumentDataList;
                    }

                    // An empty (or literal "null") JSON file deserializes to null; handing that
                    // back would only defer the failure to the caller, so process the files instead.
                    ConsoleWriteLineWithColor("The processed documents file contains no data. Processing the documents again.");
                }
            }

            var filePathsToUseForDocumentData = new List<string>();

            foreach (var filePath in Directory.GetFiles(selectedDirectory))
            {
                // The JSON cache lives in the same directory as the input files; it is not an
                // input document, so it must not be passed on to the XML processing step.
                var isProcessedDocumentDataListJson = string.Equals(
                    Path.GetFileName(filePath),
                    processedDocumentDataListJsonFileName,
                    StringComparison.OrdinalIgnoreCase);

                if (!isProcessedDocumentDataListJson)
                {
                    filePathsToUseForDocumentData.Add(filePath);
                }
            }

            var documentDataList = documentDataBusinessLogic.GetDocumentDataForMultipleXmlFiles(filePathsToUseForDocumentData);

            // A missing cache file is always created; an existing one is only replaced on request.
            var shouldWriteProcessedFile = true;

            if (processedDocumentDataListJsonFileExists)
            {
                ConsoleWriteLineWithColor("Overwrite existing processed documents file?");
                ConsoleWriteLineWithColor("1. Yes");
                ConsoleWriteLineWithColor("2. No");

                var overwriteOption = UserInputHandler.GetNumberInputFromUser(new List<int> { 1, 2 });
                shouldWriteProcessedFile = overwriteOption == 1;
            }

            if (shouldWriteProcessedFile)
            {
                var documentDataListJson = JsonConvert.SerializeObject(documentDataList);
                File.WriteAllText(processedDocumentDataListJsonPath, documentDataListJson);
            }

            return documentDataList;
        }
        /// <summary>
        /// Click handler for the Run button. Loads the document data, splits it into a training
        /// and a validation list, selects features on the training data, writes "features.json",
        /// "dataset.json" and "dataset.arff", trains the topic predictor and measures its accuracy
        /// on the validation list. The outcome is shown through the status label; timing,
        /// accuracy and any failure details are written to the trace output.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void buttonRun_Click(object sender, EventArgs e)
        {
            try
            {
                var documentDataList = documentDataBusinessLogic.GetDocumentDataForMultipleXmlFiles(filepathsToUseForDocumentData);

                // NOTE(review): 70 is presumably the percentage of documents that goes into the
                // first (training) list - confirm against SplitListIntoTwoSeparateLists.
                var lists             = SplitListIntoTwoSeparateLists(documentDataList, 70);
                var listForTraining   = lists.Item1;
                var listForValidation = lists.Item2;

                // Build the training dataset from the training split only. Using the complete
                // list would put the validation documents into the training data and make the
                // accuracy measured below meaningless.
                var datasetRepresentationTraining = listForTraining.ToDatasetRepresentation();
                datasetRepresentationTraining = datasetRepresentationTraining.ReconstructByEliminatingWordsBelowAndAboveThresholds(5, 95);

                var stopwatch = Stopwatch.StartNew();
                var features  = featureSelector.GetMostImportantWords(datasetRepresentationTraining);
                stopwatch.Stop();
                Trace.WriteLine($"Feature selection took {stopwatch.Elapsed}.");

                var featuresJson = JsonConvert.SerializeObject(features);
                File.WriteAllText("features.json", featuresJson);

                datasetRepresentationTraining = datasetRepresentationTraining.ReconstructByKeepingOnlyTheseWords(features);
                var datasetJson = JsonConvert.SerializeObject(datasetRepresentationTraining);
                var datasetArff = datasetRepresentationTraining.ToArffFileFormat();
                File.WriteAllText("dataset.json", datasetJson);
                File.WriteAllText("dataset.arff", datasetArff);

                topicPredictor.Train(datasetRepresentationTraining);

                double total = listForValidation.Count;
                var    successfullyPredicted = 0;

                foreach (var documentData in listForValidation)
                {
                    var predictedTopic = topicPredictor.PredictTopic(documentData);

                    if (documentData.Topics.Contains(predictedTopic))
                    {
                        successfullyPredicted++;
                    }
                }

                // An empty validation list would otherwise yield NaN (0 / 0).
                var accuracy = total > 0 ? successfullyPredicted / total * 100 : 0;
                Trace.WriteLine($"Validation accuracy: {accuracy:F2}% ({successfullyPredicted} of {total} documents).");

                SetStatusLabel("Done", Color.GreenYellow);
            }
            catch (Exception exception)
            {
                // Keep the UI responsive on failure, but record the details instead of discarding them.
                Trace.TraceError(exception.ToString());
                SetStatusLabel("Error", Color.Red);
            }
        }