/// <summary>
        /// Click handler that lets the user choose a <c>.wsdgeninfo</c> file and writes
        /// <c>_generationInfo</c> to it as JSON.
        /// </summary>
        /// <remarks>
        /// Nothing is written unless the save dialog returns <c>DialogResult.OK</c>.
        /// After a successful write, <c>RefreshUI(true)</c> is called and a confirmation
        /// message box is shown.
        /// </remarks>
        private void GenerateData_SaveGenerationInfoButton_Click(object sender, EventArgs e)
        {
            const string fileFilter = "Generation info files (*.wsdgeninfo)|*.wsdgeninfo";
            const string defaultExtension = ".wsdgeninfo";

            using (var saveDialog = DialogEx.SaveFile(fileFilter, defaultExtension))
            {
                // Any outcome other than OK means there is nothing to save.
                if (saveDialog.ShowDialog(this) != DialogResult.OK)
                {
                    return;
                }

                SystemJsonWriter.Write(saveDialog.FileName, _generationInfo);
                RefreshUI(true);
                MessageBox.Show("Generation info saved successfully.", "Success");
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds a new project from the raw inputs described by <paramref name="info"/>,
        /// writes every project file into <paramref name="destinationPath"/> and returns the
        /// resulting in-memory <see cref="WsdProject"/>.
        /// </summary>
        /// <param name="info">
        /// Describes the input files (dictionary, train/test data, embeddings, ...) and their
        /// data type. Validated with <c>AssertIsValid</c> before any input is read.
        /// </param>
        /// <param name="destinationPath">
        /// Directory that receives the project files. It must be identified as a directory and
        /// must not contain any file (sub-directories are searched too). Its last path segment
        /// is used as the project name; trailing directory separators are ignored.
        /// </param>
        /// <param name="progress">
        /// Progress handle; its message format is set before each stage and it is handed to
        /// the readers, analyzers and writers.
        /// </param>
        /// <returns>The project assembled from the loaded and computed data.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="info"/> or <paramref name="progress"/> is <c>null</c>, or
        /// <paramref name="destinationPath"/> is <c>null</c> or empty.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="destinationPath"/> is not a directory or already contains files.
        /// </exception>
        public static WsdProject CreateAndSave(
            WsdProjectCreateInfo info, string destinationPath, IProgressHandle progress)
        {
            if (info == null)
            {
                throw new ArgumentNullException(nameof(info));
            }

            if (string.IsNullOrEmpty(destinationPath))
            {
                throw new ArgumentNullException(nameof(destinationPath));
            }

            // EnumerateFiles is lazy, so the emptiness check stops at the first file found
            // instead of materializing the complete recursive file list.
            if (PathEx.Identify(destinationPath) != PathIdentity.Directory ||
                Directory.EnumerateFiles(destinationPath, "*", SearchOption.AllDirectories).Any())
            {
                throw new ArgumentException(ExceptionMessage.DestinationPathMustBeEmptyAndExisting);
            }

            if (progress == null)
            {
                throw new ArgumentNullException(nameof(progress));
            }

            info.AssertIsValid();

            // ---- Load the dictionary and compute its statistics ----

            progress.SetMessageFormat(MessageFormat.LoadingDictionary_Bytes);

            var dictionary = InputDictionaryReader.ReadAll(info.DictionaryPath, progress);

            progress.SetMessageFormat(MessageFormat.ComputingDictionaryStatistics);

            var dictionaryStatistics = new DictionaryStatistics().Compute(dictionary, progress);

            // ---- Load train and test data ----

            TextData[] trainData;
            TextData[] testData;

            if (info.DataType == InputDataType.PlainText)
            {
                progress.SetMessageFormat(MessageFormat.LoadingTrainData_Files);

                trainData = InputPlainTextDataReader.ReadAllFiles(info.TrainDataPath, progress);

                progress.SetMessageFormat(MessageFormat.LoadingTestData_Files);

                testData = InputPlainTextDataReader.ReadAllFiles(info.TestDataPath, progress);
            }
            else
            {
                // Every non-plain-text data type goes through the XML reader, which also
                // needs the synset mappings and the gold key files.
                progress.SetMessageFormat(MessageFormat.LoadingSynsetMappings_Bytes);

                var synsetMappings = InputSynsetMappingReader.ReadAll(info.SynsetMappingsPath, progress);

                progress.SetMessageFormat(MessageFormat.LoadingTrainData_Files);

                trainData = InputXmlDataReader.Read(
                    info.TrainDataPath, info.TrainGoldKeyPath, synsetMappings, dictionary,
                    out var trainXmlParseErrors, progress);

                // Parse errors do not abort the creation; they are written next to the project files.
                if (trainXmlParseErrors != null && trainXmlParseErrors.Any())
                {
                    XmlParseErrorWriter.WriteAll(
                        Path.Combine(destinationPath, FileName.TrainXmlParseErrors + FileExtension.Text),
                        trainXmlParseErrors);
                }

                progress.SetMessageFormat(MessageFormat.LoadingTestData_Files);

                testData = InputXmlDataReader.Read(
                    info.TestDataPath, info.TestGoldKeyPath, synsetMappings, dictionary,
                    out var testXmlParseErrors, progress);

                if (testXmlParseErrors != null && testXmlParseErrors.Any())
                {
                    XmlParseErrorWriter.WriteAll(
                        Path.Combine(destinationPath, FileName.TestXmlParseErrors + FileExtension.Text),
                        testXmlParseErrors);
                }
            }

            // ---- Analyze the data and compute its statistics ----

            progress.SetMessageFormat(MessageFormat.AnalyzingData_Files);

            var dataAnalysis = new WordAnalysisDictionary()
                               .Analyze(dictionary, trainData, testData, progress);

            progress.SetMessageFormat(MessageFormat.ComputingDataStatistics);

            var dataStatistics = new DataStatistics()
                                 .Compute(dictionary, dataAnalysis, progress);

            // ---- Load embeddings (meaning embeddings are optional) ----

            progress.SetMessageFormat(MessageFormat.LoadingWordEmbeddings_Bytes);

            var wordEmbeddings = InputEmbeddingReader.ReadAll(
                info.WordEmbeddingsPath, dataAnalysis.GetAllWordOccurrences(), progress);

            var wordEmbeddingStatistics = new EmbeddingStatistics().Compute(wordEmbeddings);

            EmbeddingDictionary meaningEmbeddings = null;

            // Stays at its freshly constructed state when no meaning embeddings are supplied.
            var meaningEmbeddingStatistics = new EmbeddingStatistics();

            if (!string.IsNullOrWhiteSpace(info.MeaningEmbeddingsPath))
            {
                progress.SetMessageFormat(MessageFormat.LoadingMeaningEmbeddings_Bytes);

                meaningEmbeddings = InputEmbeddingReader.ReadAll(
                    info.MeaningEmbeddingsPath, dataAnalysis.GetAllMeaningOccurrences(), progress);

                // NOTE(review): the return value is discarded here, unlike the word embedding
                // statistics above. Presumably Compute fills the instance in place - confirm.
                meaningEmbeddingStatistics.Compute(meaningEmbeddings);
            }

            // ---- Describe the project layout (all paths relative to destinationPath) ----

            // Path.GetFileName returns an empty string for a path that ends with a directory
            // separator, which would produce an empty project name and a project file called
            // just ".wsdproj". Trim trailing separators before taking the last segment.
            var projectName = Path.GetFileName(
                destinationPath.TrimEnd(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar));

            var projectInfo = new WsdProjectInfo
            {
                ProjectName        = projectName,
                ProjectVersion     = CurrentProjectVersion,
                ApplicationVersion = typeof(WsdProject).Assembly.GetName().Version.ToString(),
                Dictionary         = FileName.Dictionary + FileExtension.WsdData,
                TrainData          = trainData.Select(x => new WsdProjectTextDataInfo
                {
                    Name = x.TextName,
                    Path = Path.Combine(FolderName.Train, x.TextName + FileExtension.WsdData)
                }).ToArray(),
                TestData = testData.Select(x => new WsdProjectTextDataInfo
                {
                    Name = x.TextName,
                    Path = Path.Combine(FolderName.Test, x.TextName + FileExtension.WsdData)
                }).ToArray(),
                WordEmbeddings    = FileName.WordEmbeddings + FileExtension.WsdData,
                MeaningEmbeddings = meaningEmbeddings != null
                    ? FileName.MeaningEmbeddings + FileExtension.WsdData
                    : string.Empty,
                DataAnalysis                = FileName.DataAnalysis + FileExtension.WsdData,
                DictionaryStatistics        = FileName.DictionaryStatistics + FileExtension.WsdData,
                DataStatistics              = FileName.DataStatistics + FileExtension.WsdData,
                WordEmbeddingsStatistics    = FileName.WordEmbeddingsStatistics + FileExtension.WsdData,
                MeaningEmbeddingsStatistics = FileName.MeaningEmbeddingsStatistics + FileExtension.WsdData
            };

            // ---- Persist everything ----

            progress.SetMessageFormat(MessageFormat.SavingDictionary_Words);

            SystemDictionaryWriter.WriteAll(
                Path.Combine(destinationPath, projectInfo.Dictionary), dictionary, progress);

            progress.SetMessageFormat(MessageFormat.SavingTrainData_Files);

            // Single() pairs each project entry with its text by name and throws if a text
            // name is missing or occurs more than once.
            SystemDataWriter.WriteAllFiles(
                destinationPath,
                projectInfo.TrainData
                .Select(x => (x.Path, trainData.Single(y => y.TextName == x.Name).Data))
                .ToArray(),
                progress);

            progress.SetMessageFormat(MessageFormat.SavingTestData_Files);

            SystemDataWriter.WriteAllFiles(
                destinationPath,
                projectInfo.TestData
                .Select(x => (x.Path, testData.Single(y => y.TextName == x.Name).Data))
                .ToArray(),
                progress);

            progress.SetMessageFormat(MessageFormat.SavingWordEmbeddings_Embeddings);

            SystemEmbeddingWriter.WriteAll(
                Path.Combine(destinationPath, projectInfo.WordEmbeddings), wordEmbeddings, progress);

            if (meaningEmbeddings != null)
            {
                progress.SetMessageFormat(MessageFormat.SavingMeaningEmbeddings_Embeddings);

                SystemEmbeddingWriter.WriteAll(
                    Path.Combine(destinationPath, projectInfo.MeaningEmbeddings), meaningEmbeddings, progress);
            }

            progress.SetMessageFormat(MessageFormat.SavingDataAnalysis_Words);

            SystemDataAnalysisWriter.WriteAll(
                Path.Combine(destinationPath, projectInfo.DataAnalysis), dataAnalysis, progress);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.DictionaryStatistics), dictionaryStatistics);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.DataStatistics), dataStatistics);

            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.WordEmbeddingsStatistics), wordEmbeddingStatistics);

            // Written even when no meaning embeddings were loaded.
            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.MeaningEmbeddingsStatistics),
                meaningEmbeddingStatistics);

            // The project file itself is written last, after all files it refers to.
            SystemJsonWriter.Write(
                Path.Combine(destinationPath, projectInfo.ProjectName + FileExtension.WsdProj),
                projectInfo);

            return new WsdProject(
                projectInfo, dictionary, trainData, testData, wordEmbeddings, meaningEmbeddings,
                dataAnalysis, dictionaryStatistics, dataStatistics, wordEmbeddingStatistics,
                meaningEmbeddingStatistics);
        }
        /// <summary>
        /// Runs the data generation pipeline for <paramref name="project"/> and writes the
        /// generated data sets, plus two copies of the generation info, into
        /// <c>info.DestinationFolder</c>.
        /// </summary>
        /// <remarks>
        /// The registered data generation handlers are sorted by <c>GetExecutionPriority</c>
        /// and each of them is notified, in that order, around every pipeline stage.
        /// Validation set extraction and shuffling only run when enabled in
        /// <paramref name="info"/>.
        /// </remarks>
        /// <param name="project">Project whose train and test data are used for generation.</param>
        /// <param name="info">Generation settings, including the destination folder.</param>
        /// <param name="progress">Progress handle passed through to every stage and handler.</param>
        /// <exception cref="ArgumentNullException"><paramref name="info"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">
        /// The destination folder is not a directory or already contains files
        /// (sub-directories are searched too).
        /// </exception>
        public void Generate(
            WsdProject project, GenerationInfo info, IProgressHandle progress)
        {
            if (info == null)
            {
                throw new ArgumentNullException(nameof(info));
            }

            // && short-circuits, so the file listing is only attempted for an actual directory.
            var destinationIsEmptyDirectory =
                PathEx.Identify(info.DestinationFolder) == PathIdentity.Directory &&
                Directory.GetFiles(info.DestinationFolder, "*", SearchOption.AllDirectories).Length == 0;

            if (!destinationIsEmptyDirectory)
            {
                throw new ArgumentException("DestinationFolder must be an empty existing directory.");
            }

            info.AssertIsValid();

            var orderedHandlers = _dataGenerationHandlers
                .OrderBy(candidate => candidate.GetExecutionPriority(project))
                .ToArray();

            Array.ForEach(
                orderedHandlers,
                h => h.BeforeGenerationStarted(project, info, progress));

            // Stage 1: reorder the dictionary.
            var reordered = _classDeterminator.GetReorderedDictionary(project, info, progress);

            Array.ForEach(
                orderedHandlers,
                h => h.AfterDictionaryReordered(reordered, project, info, progress));

            // Stage 2: generate the records, train set first, then test set.
            var recordSets = new Dictionary<DataSetName, DataSetByText>();

            recordSets[DataSetName.Train] = new DataSetByText(
                DataSetName.Train,
                _generationAlgorithm.GenerateRecords(project.TrainData, project, info, progress));

            recordSets[DataSetName.Test] = new DataSetByText(
                DataSetName.Test,
                _generationAlgorithm.GenerateRecords(project.TestData, project, info, progress));

            Array.ForEach(
                orderedHandlers,
                h => h.AfterRecordsGenerated(recordSets, project, info, progress));

            // Stage 3: form the data set groups.
            var groups = _dataSetGrouper.FormGroups(recordSets, project, info, progress);

            Array.ForEach(
                orderedHandlers,
                h => h.AfterGroupsFormed(groups, project, info, progress));

            // Stage 4: extract the test-only set.
            _testOnlySetExtractor.Extract(groups, project, info, progress);

            Array.ForEach(
                orderedHandlers,
                h => h.AfterTestOnlySetExtracted(groups, project, info, progress));

            // Stage 5 (optional): extract the validation set.
            if (info.ExtractValidationSet)
            {
                _validationSetExtractor.Extract(groups, info, progress);

                Array.ForEach(
                    orderedHandlers,
                    h => h.AfterValidationSetExtracted(groups, project, info, progress));
            }

            // Stage 6 (optional): shuffle the data.
            if (info.ShuffleData)
            {
                _dataSetShuffler.ShuffleData(groups, progress);

                Array.ForEach(
                    orderedHandlers,
                    h => h.AfterDataShuffled(groups, project, info, progress));
            }

            // Stage 7: write the data sets and the generation info.
            var selectionContext = new FeatureSelectionContext
            {
                GenerationInfo      = info,
                ReorderedDictionary = reordered,
                FilteredPosList     = new WsdPosList(info.FilteredPosList),
                Project             = project
            };

            Array.ForEach(
                orderedHandlers,
                h => h.BeforeDataWritten(groups, project, info, progress));

            _dataSetWriter.WriteData(info.DestinationFolder, groups, selectionContext, progress);

            // The generation info is stored twice: once as-is and once wrapped in
            // GenerationInfoReadable, written to a text file.
            var generationInfoPath = Path.Combine(
                info.DestinationFolder,
                FileName.GenerationInfo + FileExtension.WsdGenInfo);

            SystemJsonWriter.Write(generationInfoPath, info);

            var readableInfoPath = Path.Combine(
                info.DestinationFolder,
                FileName.GenerationInfo + FileExtension.Text);

            SystemJsonWriter.Write(readableInfoPath, new GenerationInfoReadable(info), null, false);

            Array.ForEach(
                orderedHandlers,
                h => h.AfterGenerationCompleted(project, info, progress));
        }