Пример #1
0
        /// <summary>
        /// Reads records from <c>BaseReader</c> until its end of stream is reached and
        /// returns the collected values as an array.
        /// </summary>
        /// <param name="progress">
        /// Optional progress handle. When supplied, a scope sized to the base stream length
        /// is opened and updated with the current stream position after every read.
        /// </param>
        /// <returns>All non-null values returned by <c>Read()</c>, in read order.</returns>
        public T[] ReadAll(IProgressHandle progress = null)
        {
            // A null resource is legal in a using statement, so "no progress handle"
            // simply means nothing is disposed at the end.
            using (var scope = progress?.Scope(BaseReader.BaseStream.Length))
            {
                var items = new List<T>();

                while (!BaseReader.EndOfStream)
                {
                    var item = Read();

                    // Null results are skipped instead of being stored.
                    if (item != null)
                    {
                        items.Add(item);
                    }

                    scope?.TrySet(BaseReader.BaseStream.Position);
                }

                return items.ToArray();
            }
        }
Пример #2
0
        /// <summary>
        /// Reads the whole file at <paramref name="path"/> as text and deserializes it from
        /// JSON into an instance of <typeparamref name="T"/>.
        /// </summary>
        /// <typeparam name="T">Type to deserialize the file contents into.</typeparam>
        /// <param name="path">Path of the JSON file to read.</param>
        /// <param name="progress">
        /// Optional progress handle; a single-step scope is opened for the duration of the call.
        /// </param>
        /// <returns>The deserialized object.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="path"/> is null or empty.
        /// </exception>
        /// <exception cref="Exception">
        /// Deserialization failed; the original error is available as the inner exception.
        /// File-system errors from reading the file are not wrapped.
        /// </exception>
        public static T Read<T>(string path, IProgressHandle progress = null)
        {
            if (string.IsNullOrEmpty(path))
            {
                throw new ArgumentNullException(nameof(path));
            }

            using (progress?.Scope(1))
            {
                var data = File.ReadAllText(path);

                try
                {
                    // SECURITY NOTE(review): TypeNameHandling.Objects lets "$type" metadata inside
                    // the file decide which .NET types are instantiated. That is only safe for
                    // trusted files; if the file can come from an untrusted source, restrict the
                    // allowed types with a serialization binder.
                    //
                    // Formatting is intentionally not set here: it only influences how JSON is
                    // written and has no effect on deserialization.
                    return JsonConvert.DeserializeObject<T>(data, new JsonSerializerSettings
                    {
                        TypeNameHandling = TypeNameHandling.Objects
                    });
                }
                catch (Exception ex)
                {
                    throw new Exception(ExceptionMessage.UnableToLoadProjectData, ex);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Serializes <paramref name="data"/> to indented JSON and writes it to the file at
        /// <paramref name="path"/>.
        /// </summary>
        /// <typeparam name="T">Type of the object being written.</typeparam>
        /// <param name="path">Destination file path.</param>
        /// <param name="data">Object to serialize.</param>
        /// <param name="progress">
        /// Optional progress handle; a single-step scope is opened for the duration of the call.
        /// </param>
        /// <param name="includeTypeNames">
        /// When true, type names are emitted for objects (<c>TypeNameHandling.Objects</c>);
        /// otherwise no type names are written.
        /// </param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="path"/> is null or empty, or <paramref name="data"/> is null.
        /// </exception>
        public static void Write<T>(
            string path, T data, IProgressHandle progress = null, bool includeTypeNames = true)
        {
            if (string.IsNullOrEmpty(path))
            {
                throw new ArgumentNullException(nameof(path));
            }

            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            using (progress?.Scope(1))
            {
                var settings = new JsonSerializerSettings
                {
                    Formatting = Formatting.Indented
                };

                if (includeTypeNames)
                {
                    settings.TypeNameHandling = TypeNameHandling.Objects;
                }
                else
                {
                    settings.TypeNameHandling = TypeNameHandling.None;
                }

                File.WriteAllText(path, JsonConvert.SerializeObject(data, settings));
            }
        }
        /// <summary>
        /// Reads every file found directly inside the directory <paramref name="path"/> and
        /// wraps each one in a <c>TextData</c> named after the file (without extension).
        /// </summary>
        /// <param name="path">Directory whose files are read.</param>
        /// <param name="progress">
        /// Optional progress handle; when supplied, progress is reported as the number of
        /// files processed so far out of the total number of files.
        /// </param>
        /// <returns>One <c>TextData</c> per file, in the order returned by
        /// <c>Directory.GetFiles</c>.</returns>
        public static TextData[] ReadAllFiles(string path, IProgressHandle progress = null)
        {
            var dataFiles = Directory.GetFiles(path);

            using (var scope = progress?.Scope(dataFiles.Length))
            {
                var result = new List<TextData>(dataFiles.Length);

                for (var i = 0; i < dataFiles.Length; i++)
                {
                    var file     = dataFiles[i];
                    var textName = Path.GetFileNameWithoutExtension(file);

                    result.Add(new TextData(textName, ReadAll(file)));

                    // progress is optional, so scope may be null; the unconditional call used
                    // previously threw NullReferenceException whenever no handle was passed.
                    scope?.TrySet(i + 1);
                }

                return result.ToArray();
            }
        }
        /// <summary>
        /// Registers the word encounters of the training and test texts in this analysis
        /// dictionary.
        /// </summary>
        /// <param name="dictionary">Word dictionary passed through to <c>GetOrAdd</c> and
        /// <c>AddEncounter</c>.</param>
        /// <param name="trainData">Training texts; their encounters are added to the train and
        /// the combined encounter sets.</param>
        /// <param name="testData">Test texts; their encounters are added to the test and the
        /// combined encounter sets.</param>
        /// <param name="progress">Optional progress handle; one step is reported per processed
        /// text.</param>
        /// <returns>This instance.</returns>
        public WordAnalysisDictionary Analyze(
            WordDictionary dictionary, TextData[] trainData, TextData[] testData,
            IProgressHandle progress = null)
        {
            var totalTexts     = trainData.Length + testData.Length;
            var processedTexts = 0;

            using (var scope = progress?.Scope(totalTexts))
            {
                foreach (var trainText in trainData)
                {
                    foreach (var occurrence in trainText.Data)
                    {
                        // Only encounters with a real word and a meaning are counted; blank
                        // values and the empty-word / end-of-sentence markers are ignored.
                        if (!string.IsNullOrWhiteSpace(occurrence.Word) &&
                            !string.IsNullOrWhiteSpace(occurrence.Meaning) &&
                            occurrence.Word != RawWordEncounter.EmptyWord &&
                            occurrence.Word != RawWordEncounter.EndOfSentence)
                        {
                            var entry = GetOrAdd(dictionary, occurrence);

                            entry.TrainEncounters.AddEncounter(dictionary, occurrence);
                            entry.AllEncounters.AddEncounter(dictionary, occurrence);
                        }
                    }

                    processedTexts++;
                    scope?.TrySet(processedTexts);
                }

                foreach (var testText in testData)
                {
                    foreach (var occurrence in testText.Data)
                    {
                        // Same filter as for the training texts.
                        if (!string.IsNullOrWhiteSpace(occurrence.Word) &&
                            !string.IsNullOrWhiteSpace(occurrence.Meaning) &&
                            occurrence.Word != RawWordEncounter.EmptyWord &&
                            occurrence.Word != RawWordEncounter.EndOfSentence)
                        {
                            var entry = GetOrAdd(dictionary, occurrence);

                            entry.TestEncounters.AddEncounter(dictionary, occurrence);
                            entry.AllEncounters.AddEncounter(dictionary, occurrence);
                        }
                    }

                    processedTexts++;
                    scope?.TrySet(processedTexts);
                }
            }

            return this;
        }
        /// <summary>
        /// Splits the train set of every data set group into a reduced train set and a
        /// validation set, replacing (or removing, when empty) both sets on the group.
        /// </summary>
        /// <param name="dataSetGroups">Groups to process; each group is modified in place.
        /// Groups without a train set are left untouched.</param>
        /// <param name="info">Generation settings; <c>ValidationSetPercentage</c> determines
        /// which share of each (word, meaning) group is moved to the validation set.</param>
        /// <param name="progress">Progress handle; the index of the group being processed is
        /// reported before each group.</param>
        public void Extract(
            IList<DataSetGroup> dataSetGroups, GenerationInfo info, IProgressHandle progress)
        {
            using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingValidationSet_Groups))
            {
                var counter = 0;

                foreach (var dataSetGroup in dataSetGroups)
                {
                    scope.TrySet(counter++);

                    var oldTrainSet = dataSetGroup.DataSets.GetByName(DataSetName.Train);

                    if (oldTrainSet == null)
                    {
                        continue;
                    }

                    var validationSplit = oldTrainSet.Data
                        // Group on the (Word, Meaning) pair itself. Concatenating the two values
                        // into a single key merged distinct pairs such as ("ab", "c") and
                        // ("a", "bc") into one group.
                        .GroupBy(x => new { x.Word, x.Meaning })
                        .Select(examples =>
                        {
                            // Materialize once instead of enumerating the grouping three times.
                            var members = examples.ToArray();

                            // The first N examples of each group (rounded up) go to validation.
                            var validationGroupCount = (int)Math.Ceiling(
                                members.Length * 0.01 * info.ValidationSetPercentage);

                            return new
                            {
                                TrainGroup = members.Skip(validationGroupCount).ToArray(),
                                ValidationGroup = members.Take(validationGroupCount).ToArray()
                            };
                        })
                        .ToArray();

                    var trainExamples      = validationSplit.SelectMany(x => x.TrainGroup).ToArray();
                    var validationExamples = validationSplit.SelectMany(x => x.ValidationGroup).ToArray();

                    if (trainExamples.Length > 0)
                    {
                        dataSetGroup.DataSets[DataSetName.Train] = new DataSet(DataSetName.Train, trainExamples);
                    }
                    else
                    {
                        dataSetGroup.DataSets.Remove(DataSetName.Train);
                    }

                    if (validationExamples.Length > 0)
                    {
                        dataSetGroup.DataSets[DataSetName.Validation] = new DataSet(
                            DataSetName.Validation, validationExamples);
                    }
                    else
                    {
                        dataSetGroup.DataSets.Remove(DataSetName.Validation);
                    }
                }
            }
        }
Пример #7
0
        /// <summary>
        /// Writes a CSV file with per-group data set statistics into the destination folder,
        /// provided the statistics plugin is enabled in the project configuration.
        /// </summary>
        /// <param name="dataSetGroups">Groups for which statistics are computed.</param>
        /// <param name="project">Project supplying the plugin configuration and the word
        /// dictionary stored in its plugin data.</param>
        /// <param name="info">Generation settings; <c>DestinationFolder</c> is where the CSV
        /// file is created.</param>
        /// <param name="progress">Progress handle; the index of the group being processed is
        /// reported before each group.</param>
        /// <exception cref="OperationCanceledException">
        /// Thrown after the file has been written when the configuration requests that
        /// generation be aborted once statistics are computed.
        /// </exception>
        public void BeforeDataWritten(
            IList<DataSetGroup> dataSetGroups, WsdProject project, GenerationInfo info,
            IProgressHandle progress)
        {
            var config = project.PluginData.GetData<StatisticsPlugin, StatisticsConfig>(string.Empty);

            if (!config.PluginEnabled)
            {
                return;
            }

            var dictionary = project.PluginData.GetData<StatisticsPlugin, WordDictionary>(string.Empty);
            var filePath   = Path.Combine(
                info.DestinationFolder, FileName.DataSetStatistics + FileExtension.Csv);

            using (var streamWriter = new StreamWriter(filePath))
            using (var writer = new CsvWriter(streamWriter))
            using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ComputingStatistics_Groups))
            {
                // Header row.
                writer.WriteLine(
                    "Group", "Train examples", "Validation examples", "Test examples", "Test-only examples",
                    "Majority vote", "Train classes", "Test classes", "Train entropy", "Test entropy");

                var index = 0;

                foreach (var dataSetGroup in dataSetGroups)
                {
                    scope.TrySet(index++);

                    var statistics = DataSetGroupStatistics.Compute(dictionary, dataSetGroup);

                    // A group is left out of the report when a required set is empty or when it
                    // has fewer train + validation examples than the configured minimum.
                    var skipGroup =
                        (config.RequireTrainingSet && statistics.TrainExamples == 0) ||
                        (config.RequireTestSet && statistics.TestExamples == 0) ||
                        (statistics.TrainExamples + statistics.ValidationExamples <
                         config.MinimumTrainingValidationExamples);

                    if (skipGroup)
                    {
                        continue;
                    }

                    writer.WriteLine(
                        dataSetGroup.GroupName, statistics.TrainExamples, statistics.ValidationExamples,
                        statistics.TestExamples, statistics.TestOnlyExamples, statistics.MajorityVote,
                        statistics.TrainClasses, statistics.TestClasses, statistics.TrainEntropy,
                        statistics.TestEntropy);
                }
            }

            if (config.AbortGenerationAfterStatisticsAreComputed)
            {
                throw new OperationCanceledException();
            }
        }
Пример #8
0
        /// <summary>
        /// Splits the test set of every data set group into examples whose word has training
        /// encounters (kept as the test set) and examples whose word has none (stored as the
        /// test-only set). Empty results remove the corresponding set from the group.
        /// </summary>
        /// <param name="dataSetGroups">Groups to process; each group is modified in place.
        /// Groups without a test set are left untouched.</param>
        /// <param name="project">Project whose <c>DataAnalysis</c> is consulted for each
        /// example's word.</param>
        /// <param name="info">Generation settings (not read by this method).</param>
        /// <param name="progress">Progress handle; the index of the group being processed is
        /// reported before each group.</param>
        public void Extract(
            IList<DataSetGroup> dataSetGroups, WsdProject project, GenerationInfo info,
            IProgressHandle progress)
        {
            using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingTestOnlySet_Groups))
            {
                var position = 0;

                foreach (var dataSetGroup in dataSetGroups)
                {
                    scope.TrySet(position++);

                    var oldTestSet = dataSetGroup.DataSets.GetByName(DataSetName.Test);

                    if (oldTestSet == null)
                    {
                        continue;
                    }

                    // Partition in a single pass: key true = the word was seen in training,
                    // key false = the word only occurs in test data. Element order within each
                    // partition follows the original data order.
                    var byTrainPresence = oldTestSet.Data
                        .ToLookup(x => project.DataAnalysis[x.Word].TrainEncounters.Any());

                    var testExamples     = byTrainPresence[true].ToArray();
                    var testOnlyExamples = byTrainPresence[false].ToArray();

                    if (testExamples.Length > 0)
                    {
                        dataSetGroup.DataSets[DataSetName.Test] = new DataSet(DataSetName.Test, testExamples);
                    }
                    else
                    {
                        dataSetGroup.DataSets.Remove(DataSetName.Test);
                    }

                    if (testOnlyExamples.Length > 0)
                    {
                        dataSetGroup.DataSets[DataSetName.TestOnly] =
                            new DataSet(DataSetName.TestOnly, testOnlyExamples);
                    }
                    else
                    {
                        dataSetGroup.DataSets.Remove(DataSetName.TestOnly);
                    }
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Shuffles the data of every data set in every group by calling <c>Shuffle()</c> on it.
        /// </summary>
        /// <param name="dataSetGroups">Groups whose data sets are shuffled.</param>
        /// <param name="progress">Progress handle; the index of the group being processed is
        /// reported before each group.</param>
        public void ShuffleData(IList<DataSetGroup> dataSetGroups, IProgressHandle progress)
        {
            // NOTE(review): the progress message is the validation-set extraction one; it looks
            // copied from another method - confirm whether a shuffle-specific message exists.
            using (var scope = progress.Scope(dataSetGroups.Count, MessageFormat.ExtractingValidationSet_Groups))
            {
                for (var groupIndex = 0; groupIndex < dataSetGroups.Count; groupIndex++)
                {
                    scope.TrySet(groupIndex);

                    var dataSets = dataSetGroups[groupIndex].DataSets.Values;

                    foreach (var dataSet in dataSets)
                    {
                        dataSet.Data.Shuffle();
                    }
                }
            }
        }
Пример #10
0
        /// <summary>
        /// Scans <c>PluginDirectory</c> (including subdirectories) for <c>*.dll</c> files, loads
        /// each one as an assembly and instantiates every concrete type that implements
        /// <c>IPlugin</c>.
        /// </summary>
        /// <param name="progress">Progress handle; one step is reported per assembly file.</param>
        /// <returns>
        /// One <c>PluginInfo</c> per discovered plugin type. For types that could not be
        /// instantiated (or whose <c>GetComponents</c> call failed) the entry carries a null
        /// instance and null components. Files that cannot be loaded or inspected contribute
        /// no entries.
        /// </returns>
        public static PluginInfo[] LoadPlugins(IProgressHandle progress)
        {
            var plugins       = new List<PluginInfo>();
            var assemblyFiles = Directory.GetFiles(PluginDirectory, "*.dll", SearchOption.AllDirectories);

            using (var scope = progress.Scope(assemblyFiles.Length, MessageFormat.LoadingPlugins_Files))
            {
                var counter = 0;

                foreach (var assemblyFile in assemblyFiles)
                {
                    try
                    {
                        var assembly    = Assembly.LoadFile(assemblyFile);
                        var pluginTypes = assembly.GetTypes()
                            // Interfaces and abstract classes can never be instantiated; without
                            // this filter each of them produced a bogus "failed" plugin entry.
                            .Where(x => typeof(IPlugin).IsAssignableFrom(x) && !x.IsAbstract)
                            .ToArray();

                        foreach (var pluginType in pluginTypes)
                        {
                            try
                            {
                                var pluginInstance   = (IPlugin)Activator.CreateInstance(pluginType);
                                var pluginComponents = pluginInstance.GetComponents() ?? new IPluginComponent[0];

                                plugins.Add(new PluginInfo(
                                                true, assemblyFile, assembly.GetName(),
                                                pluginInstance, pluginComponents));
                            }
                            catch
                            {
                                // The plugin type exists but could not be created; record it
                                // without an instance so the failure stays visible to the caller.
                                // NOTE(review): the first argument stays true here as well -
                                // confirm that this is the intended value for a failed plugin.
                                plugins.Add(new PluginInfo(
                                                true, assemblyFile, assembly.GetName(), null, null));
                            }
                        }
                    }
                    catch
                    {
                        // Deliberate best-effort: a DLL that is not a loadable managed assembly,
                        // or whose types cannot be enumerated, is skipped so that one bad file
                        // does not prevent the remaining plugins from loading.
                    }

                    scope.TrySet(++counter);
                }
            }

            return plugins.ToArray();
        }
Пример #11
0
        /// <summary>
        /// Writes every element of <paramref name="list"/>, in order, by calling <c>Write</c>
        /// on each one.
        /// </summary>
        /// <param name="list">Items to write.</param>
        /// <param name="progress">Optional progress handle; when supplied, the number of items
        /// written so far is reported after each write.</param>
        public void WriteAll(IList<T> list, IProgressHandle progress = null)
        {
            using (var scope = progress?.Scope(list.Count))
            {
                var written = 0;

                while (written < list.Count)
                {
                    Write(list[written]);

                    written++;
                    scope?.TrySet(written);
                }
            }
        }
Пример #12
0
        /// <summary>
        /// Generates records for every text in <paramref name="data"/> and pairs each result
        /// with the name of the text it came from.
        /// </summary>
        /// <param name="data">Texts to process.</param>
        /// <param name="project">Project passed through to the per-text record generation.</param>
        /// <param name="info">Generation settings passed through to the per-text record
        /// generation.</param>
        /// <param name="progress">Progress handle; the index of the text being processed is
        /// reported before each text.</param>
        /// <returns>One <c>GeneratedTextData</c> per input text, in input order.</returns>
        public IList<GeneratedTextData> GenerateRecords(
            IList<TextData> data, WsdProject project, GenerationInfo info, IProgressHandle progress)
        {
            var generated = new List<GeneratedTextData>();

            using (var scope = progress.Scope(data.Count, MessageFormat.GeneratingRecords_Texts))
            {
                var index = 0;

                foreach (var text in data)
                {
                    scope.TrySet(index++);

                    var textName = text.TextName;
                    var records  = GenerateRecords(text.Data, project, info);

                    generated.Add(new GeneratedTextData(textName, records));
                }
            }

            return generated;
        }
Пример #13
0
        /// <summary>
        /// Fills this instance with summary figures about <paramref name="dictionary"/>: total
        /// word count, number of words with exactly one / more than one meaning, maximum and
        /// average meanings per word, and the number of distinct meaning keys.
        /// </summary>
        /// <param name="dictionary">Dictionary to summarize.</param>
        /// <param name="progress">Optional progress handle; a single-step scope is opened for
        /// the duration of the call.</param>
        /// <returns>This instance.</returns>
        public DictionaryStatistics Compute(WordDictionary dictionary, IProgressHandle progress = null)
        {
            using (progress?.Scope(1))
            {
                WordCount = dictionary.Count;

                var words = dictionary.Values;

                MonosemanticWordCount  = words.Count(word => word.Meanings.Count == 1);
                PolysemanticWordCount  = words.Count(word => word.Meanings.Count > 1);
                MaxMeaningsPerWord     = words.MaxOrDefault(word => word.Meanings.Count);
                AverageMeaningsPerWord = words.AverageOrDefault(word => word.Meanings.Count);

                var meaningKeys = words.SelectMany(word => word.Meanings.Keys);

                UniqueMeaningsCount = meaningKeys.Distinct().Count();
            }

            return this;
        }
Пример #14
0
        /// <summary>
        /// Fills this instance with example counts and baseline ratios derived from
        /// <paramref name="dataAnalysis"/>, using <paramref name="dictionary"/> to decide how many
        /// meanings each word has.
        /// </summary>
        /// <param name="dictionary">Dictionary used to look up each analyzed word's meanings.
        /// Words missing from it are treated as having zero meanings and therefore fall into
        /// neither the one-meaning nor the multi-meaning buckets.</param>
        /// <param name="dataAnalysis">Per-word train and test encounter data.</param>
        /// <param name="progress">Optional progress handle; a single-step scope is opened for
        /// the duration of the call.</param>
        /// <returns>This instance.</returns>
        public DataStatistics Compute(
            WordDictionary dictionary, WordAnalysisDictionary dataAnalysis, IProgressHandle progress = null)
        {
            var scope = progress?.Scope(1);

            try
            {
                // Encounter totals, split by train/test and by whether the dictionary lists
                // exactly one meaning or more than one meaning for the word.
                MonosemanticTrainExamples = dataAnalysis.Values
                                            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
                                            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));

                PolysemanticTrainExamples = dataAnalysis.Values
                                            .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                            .Sum(x => x.TrainEncounters.Values.Sum(y => y.Encounters));

                MonosemanticTestExamples = dataAnalysis.Values
                                           .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) == 1)
                                           .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

                PolysemanticTestExamples = dataAnalysis.Values
                                           .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                           .Sum(x => x.TestEncounters.Values.Sum(y => y.Encounters));

                // "Common": test encounters of multi-meaning words that have at least one
                // train encounter.
                var commonAnalysis = dataAnalysis.Values
                                     .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                     .Where(x => x.TrainEncounters.Count != 0)
                                     .SelectMany(x => x.TestEncounters
                                                 .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                     .ToArray();

                // "Learnable": the subset of common test encounters whose key is also present
                // among the word's train encounters.
                var learnableAnalysis = dataAnalysis.Values
                                        .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                        .Where(x => x.TrainEncounters.Count != 0)
                                        .SelectMany(x => x.TestEncounters
                                                    .Where(y => x.TrainEncounters.ContainsKey(y.Key))
                                                    .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                        .ToArray();

                // "Non-learnable": common test encounters whose key is absent from the word's
                // train encounters.
                var nonLearnableAnalysis = dataAnalysis.Values
                                           .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                           .Where(x => x.TrainEncounters.Count != 0)
                                           .SelectMany(x => x.TestEncounters
                                                       .Where(y => !x.TrainEncounters.ContainsKey(y.Key))
                                                       .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                           .ToArray();

                // "Test-only": test encounters of multi-meaning words that have no train
                // encounters at all.
                var testOnlyAnalysis = dataAnalysis.Values
                                       .Where(x => (dictionary.GetByName(x.Word)?.Meanings.Count ?? 0) > 1)
                                       .Where(x => x.TrainEncounters.Count == 0)
                                       .SelectMany(x => x.TestEncounters
                                                   .Select(y => new
                {
                    x.Word,
                    y.Value.Meaning,
                    y.Value.PartOfSpeech,
                    y.Value.Encounters
                }))
                                       .ToArray();

                CommonTestExamples       = commonAnalysis.Sum(x => x.Encounters);
                LearnableTestExamples    = learnableAnalysis.Sum(x => x.Encounters);
                NonLearnableTestExamples = nonLearnableAnalysis.Sum(x => x.Encounters);
                TestOnlyExamples         = testOnlyAnalysis.Sum(x => x.Encounters);

                // Dictionary-based guesses: a test encounter counts as correct when its meaning
                // equals the dictionary meaning with the highest Encounters value for that word.
                CorrectDictionaryBasedLearnableGuesses = learnableAnalysis
                                                         .Where(x => dictionary
                                                                .GetByName(x.Word)
                                                                .Meanings.Values
                                                                .OrderByDescending(y => y.Encounters)
                                                                .First()
                                                                .Meaning ==
                                                                x.Meaning)
                                                         .Sum(x => x.Encounters);

                CorrectDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
                                                            .Where(x => dictionary
                                                                   .GetByName(x.Word)
                                                                   .Meanings.Values
                                                                   .OrderByDescending(y => y.Encounters)
                                                                   .First()
                                                                   .Meaning ==
                                                                   x.Meaning)
                                                            .Sum(x => x.Encounters);

                CorrectDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
                                                        .Where(x => dictionary
                                                               .GetByName(x.Word)
                                                               .Meanings.Values
                                                               .OrderByDescending(y => y.Encounters)
                                                               .First()
                                                               .Meaning ==
                                                               x.Meaning)
                                                        .Sum(x => x.Encounters);

                // Part-of-speech-aware dictionary guesses: same as above, but the guess is the
                // highest-ranked dictionary meaning with the encounter's part of speech.
                // NOTE(review): First(predicate) throws InvalidOperationException when the
                // dictionary has no meaning with that part of speech - confirm this cannot occur.
                CorrectPosDictionaryBasedLearnableGuesses = learnableAnalysis
                                                            .Where(x => dictionary
                                                                   .GetByName(x.Word)
                                                                   .Meanings.Values
                                                                   .OrderByDescending(y => y.Encounters)
                                                                   .First(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                   .Meaning ==
                                                                   x.Meaning)
                                                            .Sum(x => x.Encounters);

                CorrectPosDictionaryBasedNonLearnableGuesses = nonLearnableAnalysis
                                                               .Where(x => dictionary
                                                                      .GetByName(x.Word)
                                                                      .Meanings.Values
                                                                      .OrderByDescending(y => y.Encounters)
                                                                      .First(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                      .Meaning ==
                                                                      x.Meaning)
                                                               .Sum(x => x.Encounters);

                CorrectPosDictionaryBasedTestOnlyGuesses = testOnlyAnalysis
                                                           .Where(x => dictionary
                                                                  .GetByName(x.Word)
                                                                  .Meanings.Values
                                                                  .OrderByDescending(y => y.Encounters)
                                                                  .First(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                  .Meaning ==
                                                                  x.Meaning)
                                                           .Sum(x => x.Encounters);

                // Training-based guesses: the guess is the train encounter with the highest
                // Encounters value for the word (optionally restricted to the same part of speech).
                // NOTE(review): FirstOrDefault().Meaning would throw NullReferenceException if the
                // sequence were empty and the element type is a reference type - in particular the
                // part-of-speech-filtered variant; confirm a match is always present.
                CorrectTrainingBasedLearnableGuesses = learnableAnalysis
                                                       .Where(x => dataAnalysis[x.Word]
                                                              .TrainEncounters
                                                              .Values
                                                              .OrderByDescending(y => y.Encounters)
                                                              .FirstOrDefault()
                                                              .Meaning ==
                                                              x.Meaning)
                                                       .Sum(x => x.Encounters);

                CorrectPosTrainingBasedLearnableGuesses = learnableAnalysis
                                                          .Where(x => dataAnalysis[x.Word]
                                                                 .TrainEncounters
                                                                 .Values
                                                                 .Where(y => y.PartOfSpeech == x.PartOfSpeech)
                                                                 .OrderByDescending(y => y.Encounters)
                                                                 .FirstOrDefault()
                                                                 .Meaning ==
                                                                 x.Meaning)
                                                          .Sum(x => x.Encounters);

                // Baseline ratios. Each plain baseline divides by the common (or multi-meaning)
                // test example count; each "All_" variant additionally counts every one-meaning
                // test example as correct and divides by all test examples.
                // NOTE(review): the divisions are performed in double, so a zero denominator
                // yields NaN or Infinity rather than an exception - confirm callers handle that.
                FirstSenseDictionaryBaseline = (CorrectDictionaryBasedLearnableGuesses +
                                                CorrectDictionaryBasedNonLearnableGuesses) /
                                               (double)CommonTestExamples;

                All_FirstSenseDictionaryBaseline = (CorrectDictionaryBasedLearnableGuesses +
                                                    CorrectDictionaryBasedNonLearnableGuesses +
                                                    CorrectDictionaryBasedTestOnlyGuesses +
                                                    MonosemanticTestExamples) /
                                                   (double)(PolysemanticTestExamples + MonosemanticTestExamples);

                FirstSensePosDictionaryBaseline = (CorrectPosDictionaryBasedLearnableGuesses +
                                                   CorrectPosDictionaryBasedNonLearnableGuesses) /
                                                  (double)CommonTestExamples;

                All_FirstSensePosDictionaryBaseline = (CorrectPosDictionaryBasedLearnableGuesses +
                                                       CorrectPosDictionaryBasedNonLearnableGuesses +
                                                       CorrectPosDictionaryBasedTestOnlyGuesses +
                                                       MonosemanticTestExamples) /
                                                      (double)(PolysemanticTestExamples +
                                                               MonosemanticTestExamples);

                FirstSenseBaseline = CorrectTrainingBasedLearnableGuesses /
                                     (double)CommonTestExamples;

                All_FirstSenseBaseline = (CorrectTrainingBasedLearnableGuesses + MonosemanticTestExamples) /
                                         (double)(PolysemanticTestExamples + MonosemanticTestExamples);

                FirstSensePosBaseline = CorrectPosTrainingBasedLearnableGuesses /
                                        (double)CommonTestExamples;

                All_FirstSensePosBaseline = (CorrectPosTrainingBasedLearnableGuesses + MonosemanticTestExamples) /
                                            (double)(PolysemanticTestExamples + MonosemanticTestExamples);

                // Best case: every learnable test example is assumed correct, plus the
                // dictionary-based correct guesses on test-only examples.
                BestCaseBaseline = (CorrectDictionaryBasedTestOnlyGuesses +
                                    LearnableTestExamples) /
                                   (double)PolysemanticTestExamples;

                All_BestCaseBaseline = (CorrectDictionaryBasedTestOnlyGuesses +
                                        LearnableTestExamples + MonosemanticTestExamples) /
                                       (double)(PolysemanticTestExamples + MonosemanticTestExamples);
            }
            finally
            {
                // Close the progress scope even if one of the computations above throws.
                scope?.Dispose();
            }

            return(this);
        }
Пример #15
0
        /// <summary>
        /// Builds a new <see cref="WordDictionary"/> in which the meanings of every word are
        /// re-sorted according to <c>info.OrderMeanings</c> and renumbered starting from 1.
        /// </summary>
        /// <param name="project">Supplies the source dictionary and the data analysis.</param>
        /// <param name="info">Supplies <c>OrderMeanings</c> and <c>OrderMeaningsStrategy</c>.</param>
        /// <param name="progress">Progress handle; a one-step scope is held for the whole call.</param>
        /// <returns>The reordered dictionary; words that end up with no meanings are left out.</returns>
        public WordDictionary GetReorderedDictionary(
            WsdProject project, GenerationInfo info, IProgressHandle progress)
        {
            // Sorts one sequence of meanings as requested by info.OrderMeanings and
            // assigns fresh ids (1-based) in the resulting order.
            IEnumerable<DictionaryMeaning> Reorder(
                IEnumerable<DictionaryMeaning> source, string ownerWord)
            {
                var ordered = source;

                switch (info.OrderMeanings)
                {
                    case OrderMeanings.ByDictionary:
                    case OrderMeanings.ByTrainingSet:
                        ordered = source.OrderByDescending(m => m.Encounters);
                        break;

                    case OrderMeanings.ByDictionaryAndTrainingSet:
                        // Ties on dictionary encounters are broken by training-set encounters;
                        // a word or meaning missing from the analysis counts as 0.
                        ordered = source
                                  .OrderByDescending(m => m.Encounters)
                                  .ThenByDescending(m => project.DataAnalysis.GetByName(ownerWord)?
                                                    .TrainEncounters
                                                    .GetByName(m.Meaning)?.Encounters ?? 0);
                        break;
                }

                return ordered.Select((m, position) => new DictionaryMeaning
                {
                    Id = position + 1,
                    Meaning = m.Meaning,
                    PartOfSpeech = m.PartOfSpeech,
                    Encounters = m.Encounters
                });
            }

            using (progress.Scope(1, MessageFormat.ReorderingDictionary))
            {
                IEnumerable<DictionaryWord> sourceWords;

                if (info.OrderMeanings == OrderMeanings.ByTrainingSet)
                {
                    // Words and meanings come from the training-set analysis; only the
                    // word id is taken from the project dictionary.
                    sourceWords = project.DataAnalysis
                                  .Values
                                  .Where(entry => entry.TrainEncounters.Any())
                                  .Select(entry => new DictionaryWord
                                  {
                                      Id = project.Dictionary[entry.Word].Id,
                                      Word = entry.Word,
                                      Meanings = entry.TrainEncounters.Values
                                                 .OrderByDescending(t => t.Encounters)
                                                 .Select((t, position) => new DictionaryMeaning
                                                 {
                                                     Id = position + 1,
                                                     Meaning = t.Meaning,
                                                     PartOfSpeech = t.PartOfSpeech,
                                                     Encounters = t.Encounters
                                                 })
                                                 .ToMeaningDictionary()
                                  });
                }
                else
                {
                    sourceWords = project.Dictionary.Values;
                }

                return sourceWords
                       .Select(candidate =>
                       {
                           var meanings = (IEnumerable<DictionaryMeaning>)candidate.Meanings.Values;

                           // With GroupByWordAndPos each part-of-speech group is ordered
                           // (and numbered) independently; otherwise the whole list is.
                           var reordered =
                               info.OrderMeaningsStrategy == OrderMeaningsStrategy.GroupByWordAndPos
                                   ? meanings
                                     .GroupBy(m => m.PartOfSpeech)
                                     .SelectMany(posGroup => Reorder(posGroup, candidate.Word))
                                   : Reorder(meanings, candidate.Word);

                           return new DictionaryWord
                           {
                               Id = candidate.Id,
                               Word = candidate.Word,
                               Meanings = reordered.ToMeaningDictionary()
                           };
                       })
                       .Where(candidate => candidate.Meanings.Count > 0)
                       .ToWordDictionary();
            }
        }
Пример #16
0
        /// <summary>
        /// Splits the records of every data set into named groups according to
        /// <c>info.SavingStrategy</c> and merges same-named groups from different data sets
        /// into a single <see cref="DataSetGroup"/>.
        /// </summary>
        /// <param name="dataSets">Data sets to split; only the dictionary values are used.</param>
        /// <param name="project">Supplies the word dictionary used to build per-word group names.</param>
        /// <param name="info">Supplies the <c>SavingStrategy</c> that selects the grouping key.</param>
        /// <param name="progress">Progress handle; a scope sized to <c>dataSets.Count</c> is advanced once per data set.</param>
        /// <returns>One group per distinct group name, each holding a data set per source data set that contributed records.</returns>
        /// <exception cref="NotSupportedException">
        /// <c>info.SavingStrategy</c> is not one of the handled strategies.
        /// </exception>
        public IList <DataSetGroup> FormGroups(
            Dictionary <DataSetName, DataSetByText> dataSets, WsdProject project, GenerationInfo info,
            IProgressHandle progress)
        {
            var dataSetGroups = new Dictionary <string, DataSetGroup>();

            using (var scope = progress.Scope(dataSets.Count, MessageFormat.FormingGroups_DataSets))
            {
                var counter = 0;

                foreach (var dataSet in dataSets.Values)
                {
                    // Progress is reported before the data set is processed.
                    scope.TrySet(counter++);

                    IEnumerable <(string groupName, IEnumerable <RawRecord> data)> dataByGroup;

                    switch (info.SavingStrategy)
                    {
                    case SavingStrategy.SingleFile:
                    {
                        // A constant key puts every record into one group named "".
                        dataByGroup = dataSet.Texts
                                      .SelectMany(x => x.Data)
                                      .GroupBy(x => string.Empty)
                                      .Select(x => (x.Key, (IEnumerable <RawRecord>)x));
                        break;
                    }

                    case SavingStrategy.FilePerWord:
                    {
                        // Group name: "<word>__<dictionary id of the word>".
                        dataByGroup = dataSet.Texts
                                      .SelectMany(x => x.Data)
                                      .GroupBy(x => x.Word + "__" + project.Dictionary.GetByName(x.Word).Id)
                                      .Select(x => (x.Key, (IEnumerable <RawRecord>)x));
                        break;
                    }

                    case SavingStrategy.FilePerPos:
                    {
                        dataByGroup = dataSet.Texts
                                      .SelectMany(x => x.Data)
                                      .GroupBy(x => x.Pos)
                                      .Select(x => (x.Key, (IEnumerable <RawRecord>)x));
                        break;
                    }

                    case SavingStrategy.FilePerWordAndPos:
                    {
                        // Group name: "<word>__<pos>__<dictionary id of the word>".
                        dataByGroup = dataSet.Texts
                                      .SelectMany(x => x.Data)
                                      .GroupBy(x => x.Word + "__" + x.Pos + "__" + project.Dictionary.GetByName(x.Word).Id)
                                      .Select(x => (x.Key, (IEnumerable <RawRecord>)x));
                        break;
                    }

                    case SavingStrategy.OriginalFiles:
                    {
                        // Keep the original split: one group per text, named after the text.
                        dataByGroup = dataSet.Texts
                                      .Select(x => (x.TextName, (IEnumerable <RawRecord>)x.Data));
                        break;
                    }

                    default:
                    {
                        throw new NotSupportedException(
                                  $"Saving strategy {info.SavingStrategy} is not supported.");
                    }
                    }

                    foreach (var(groupName, data) in dataByGroup)
                    {
                        // Single lookup; the group is created the first time its name is seen.
                        if (!dataSetGroups.TryGetValue(groupName, out var group))
                        {
                            group = new DataSetGroup(groupName);
                            dataSetGroups[groupName] = group;
                        }

                        group.DataSets[dataSet.Name] = new DataSet(dataSet.Name, data.ToArray());
                    }
                }

                return(dataSetGroups.Values.ToArray());
            }
        }
Пример #17
0
        /// <summary>
        /// Deserializes the XML file at <paramref name="dataPath"/> as <c>UefXmlData</c> and converts
        /// each of its texts into a <see cref="TextData"/> made of raw word encounters, with an
        /// end-of-sentence encounter appended after every sentence.
        /// </summary>
        /// <param name="dataPath">Path of the XML data file.</param>
        /// <param name="goldKeyPath">
        /// Path of the gold-key file. Blank lines and lines with fewer than two space-separated
        /// tokens are ignored; the first token is the key and the remaining tokens, re-joined
        /// with spaces, are the value. Lines repeating a first token are collapsed via <c>DistinctBy</c>.
        /// </param>
        /// <param name="synsetMappings">Passed through to <c>SynsetHelper.TryGetMeaning</c>.</param>
        /// <param name="dictionary">Passed through to <c>SynsetHelper.TryGetMeaning</c>.</param>
        /// <param name="errors">
        /// Receives one entry per <c>instance</c> item whose meaning lookup did not return
        /// <c>TryGetMeaningStatus.OK</c>.
        /// </param>
        /// <param name="progress">Optional progress handle; a one-step scope is held for the whole call.</param>
        /// <returns>One <see cref="TextData"/> per text in the XML file.</returns>
        public static TextData[] Read(
            string dataPath, string goldKeyPath, SynsetDictionary synsetMappings,
            WordDictionary dictionary, out XmlParseError[] errors,
            IProgressHandle progress = null)
        {
            // A null scope (no progress handle supplied) is simply not disposed.
            using (progress?.Scope(1))
            {
                var texts         = new List <TextData>();
                var xmlSerializer = new XmlSerializer(typeof(UefXmlData));
                var failures      = new List <XmlParseError>();

                using (var dataReader = new StreamReader(dataPath))
                {
                    var goldKeys = File.ReadAllLines(goldKeyPath)
                                   .Where(line => !string.IsNullOrWhiteSpace(line))
                                   .Select(line => line.Trim(' ').Split(' '))
                                   .Where(tokens => tokens.Length > 1)
                                   .DistinctBy(tokens => tokens[0])
                                   .ToDictionary(
                                       tokens => tokens[0],
                                       tokens => string.Join(" ", tokens.Skip(1)));

                    var document = (UefXmlData)xmlSerializer.Deserialize(dataReader);

                    foreach (var text in document.Texts)
                    {
                        var collected = new List <RawWordEncounter>();

                        foreach (var sentence in text.Sentences)
                        {
                            for (var index = 0; index < sentence.Encounters.Length; index++)
                            {
                                var item       = sentence.Encounters[index];
                                var isInstance = sentence.EnumTypes[index] == ItemChoiceType.instance;

                                // Meaning stays empty unless the item is an instance
                                // whose meaning is resolved below.
                                var parsed = new RawWordEncounter
                                {
                                    Word    = item.Lemma,
                                    Pos     = item.Pos,
                                    Meaning = string.Empty
                                };

                                if (isInstance)
                                {
                                    var status = SynsetHelper.TryGetMeaning(
                                        dictionary, goldKeys, synsetMappings,
                                        item.Lemma, item.Id, out var meaning);

                                    if (status != TryGetMeaningStatus.OK)
                                    {
                                        // The encounter is still emitted; the failure is only recorded.
                                        failures.Add(new XmlParseError
                                        {
                                            EncounterId = item.Id,
                                            Error       = status
                                        });
                                    }
                                    else
                                    {
                                        parsed.Meaning = meaning;
                                    }
                                }

                                collected.Add(parsed);
                            }

                            collected.Add(RawWordEncounter.EndOfSentenceEncounter);
                        }

                        texts.Add(new TextData(text.Id, collected.ToArray()));
                    }
                }

                errors = failures.ToArray();

                return texts.ToArray();
            }
        }