Example #1
0
        /// <summary>
        /// Produces a single feature holding the encounter's word and increments the
        /// <c>WordElementCounter</c> of the <c>UsageStatistics</c> stored in the
        /// project's plugin data for <c>LoggingPlugin</c>.
        /// </summary>
        /// <param name="encounter">Encounter whose word becomes the feature value.</param>
        /// <param name="context">Selection context giving access to the project plugin data.</param>
        /// <returns>A one-element list containing the word feature.</returns>
        public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
        {
            // The statistics entry is looked up under the empty-string key.
            var statistics = context.Project.PluginData
                             .GetData <LoggingPlugin, UsageStatistics>(string.Empty);

            statistics.WordElementCounter++;

            var wordFeature = new FeatureValue(encounter.Word);

            return(new[] { wordFeature });
        }
Example #2
0
 /// <summary>
 /// Produces a single feature holding the encounter's meaning, or the literal
 /// <c>"unknown"</c> when the meaning is null or empty.
 /// </summary>
 /// <param name="encounter">Encounter whose meaning becomes the feature value.</param>
 /// <param name="context">Selection context (not used by this feature).</param>
 /// <returns>A one-element list containing the meaning feature.</returns>
 public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
 {
     var meaning = string.IsNullOrEmpty(encounter.Meaning)
         ? "unknown"
         : encounter.Meaning;

     return(new[] { new FeatureValue(meaning) });
 }
Example #3
0
 /// <summary>
 /// Produces a single feature holding the id of the encounter's meaning as found in
 /// the reordered dictionary, or <c>0</c> when the word or the meaning is not found.
 /// </summary>
 /// <param name="encounter">Encounter supplying the word and meaning names to look up.</param>
 /// <param name="context">Selection context giving access to the reordered dictionary.</param>
 /// <returns>A one-element list containing the meaning-id feature.</returns>
 public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
 {
     var dictionaryWord    = context.ReorderedDictionary.GetByName(encounter.Word);
     var dictionaryMeaning = dictionaryWord?.Meanings.GetByName(encounter.Meaning);
     var meaningId         = dictionaryMeaning?.Id ?? 0;

     return(new[] { new FeatureValue(meaningId) });
 }
        /// <summary>
        /// Returns the analysis entry registered under the encounter's word, creating and
        /// registering a new one when none exists yet.
        /// </summary>
        /// <param name="dictionary">Dictionary used to resolve the id of a newly created entry.</param>
        /// <param name="encounter">Encounter whose word identifies the entry.</param>
        /// <returns>The existing or newly added <c>WordAnalysis</c>.</returns>
        private WordAnalysis GetOrAdd(WordDictionary dictionary, RawWordEncounter encounter)
        {
            var knownWord = dictionary.GetByName(encounter.Word);
            var existing  = GetByName(encounter.Word);

            if (existing != null)
            {
                return(existing);
            }

            // Words missing from the dictionary get the id -1.
            var created = new WordAnalysis
            {
                Id   = knownWord?.Id ?? -1,
                Word = encounter.Word
            };

            Add(encounter.Word, created);

            return(created);
        }
Example #5
0
        /// <summary>
        /// Produces the meaning-embedding feature vector for an encounter.
        /// </summary>
        /// <remarks>
        /// For the <c>Train</c> and <c>Validation</c> data sets the embedding of the
        /// encounter's own meaning is returned. For any other data set the result is the
        /// weighted average of the embeddings of all dictionary meanings of the word whose
        /// part of speech equals the encounter's, each meaning weighted by
        /// <c>Encounters + 1</c>. When the word has no such meanings the value of
        /// <c>FeatureValue.NewArray(0, VectorLength)</c> is returned.
        /// </remarks>
        /// <param name="encounter">Encounter supplying the word, meaning and part of speech.</param>
        /// <param name="context">Selection context giving access to the data set name,
        /// the reordered dictionary and the project's meaning embeddings.</param>
        /// <returns>The feature values built from the selected embedding.</returns>
        public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
        {
            var embeddings = context.Project.MeaningEmbeddings;

            if (context.DataSetName == DataSetName.Train ||
                context.DataSetName == DataSetName.Validation)
            {
                var embedding = embeddings.GetVectorOrDefault(encounter.Meaning);

                return(FeatureValue.NewArray(embedding));
            }

            var meanings = context.ReorderedDictionary
                           .GetByName(encounter.Word)?
                           .Meanings.Values
                           .Where(x => x.PartOfSpeech == encounter.Pos)
                           .ToArray();

            if (meanings == null || meanings.Length == 0)
            {
                return(FeatureValue.NewArray(0, embeddings.VectorLength));
            }

            var averageEmbedding = new float[embeddings.VectorLength];
            var totalWeight      = 0;

            foreach (var meaning in meanings)
            {
                // +1 so that meanings with no recorded encounters still contribute.
                var weight    = meaning.Encounters + 1;
                var embedding = embeddings.GetVectorOrDefault(meaning.Meaning);

                // The weight is counted once per meaning. Counting it once per vector
                // component would inflate the divisor by the vector length and shrink
                // the resulting average accordingly.
                totalWeight += weight;

                for (var i = 0; i < averageEmbedding.Length; i++)
                {
                    averageEmbedding[i] += embedding[i] * weight;
                }
            }

            for (var i = 0; i < averageEmbedding.Length; i++)
            {
                averageEmbedding[i] /= totalWeight;
            }

            return(FeatureValue.NewArray(averageEmbedding));
        }
        /// <summary>
        /// Produces the meaning-embedding feature vector for an encounter.
        /// </summary>
        /// <remarks>
        /// For the <c>Train</c> and <c>Validation</c> data sets the embedding of the
        /// encounter's own meaning is returned. For any other data set the embedding of
        /// the dictionary meaning with <c>Id == 1</c> and the encounter's part of speech
        /// is used; when the word or such a meaning is missing, <c>null</c> is passed to
        /// <c>GetVectorOrDefault</c>.
        /// </remarks>
        /// <param name="encounter">Encounter supplying the word, meaning and part of speech.</param>
        /// <param name="context">Selection context giving access to the data set name,
        /// the reordered dictionary and the project's meaning embeddings.</param>
        /// <returns>The feature values built from the selected embedding.</returns>
        public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
        {
            var isTrain      = context.DataSetName == DataSetName.Train;
            var isValidation = !isTrain && context.DataSetName == DataSetName.Validation;

            if (isTrain || isValidation)
            {
                return(FeatureValue.NewArray(
                           context.Project.MeaningEmbeddings.GetVectorOrDefault(encounter.Meaning)));
            }

            var dictionaryWord = context.ReorderedDictionary.GetByName(encounter.Word);

            // SingleOrDefault throws if more than one meaning satisfies the predicate.
            var topMeaning = dictionaryWord?
                             .Meanings.Values
                             .SingleOrDefault(x => x.Id == 1 && x.PartOfSpeech == encounter.Pos);

            var topMeaningName = topMeaning?.Meaning;

            return(FeatureValue.NewArray(
                       context.Project.MeaningEmbeddings.GetVectorOrDefault(topMeaningName)));
        }
        /// <summary>
        /// Records one more encounter of the encounter's meaning, registering a new
        /// <c>DictionaryMeaning</c> entry first when the meaning has not been seen yet.
        /// </summary>
        /// <param name="dictionary">Dictionary used to resolve the id of a newly created entry.</param>
        /// <param name="encounter">Encounter supplying the word and meaning names.</param>
        internal void AddEncounter(WordDictionary dictionary, RawWordEncounter encounter)
        {
            var knownMeaning = dictionary.GetByName(encounter.Word)?
                               .Meanings.GetByName(encounter.Meaning);

            var tracked = GetByName(encounter.Meaning);

            if (tracked != null)
            {
                tracked.Encounters++;

                return;
            }

            // Meanings missing from the dictionary get the id -1.
            var created = new DictionaryMeaning
            {
                Id           = knownMeaning?.Id ?? -1,
                Meaning      = encounter.Meaning,
                PartOfSpeech = SynsetHelper.GetPos(encounter.Meaning)
            };

            Add(encounter.Meaning, created);

            created.Encounters++;
        }
Example #8
0
        /// <summary>
        /// Produces the feature values built from the vector that the filtered
        /// part-of-speech list returns for the encounter's part of speech.
        /// </summary>
        /// <param name="encounter">Encounter whose part of speech is looked up.</param>
        /// <param name="context">Selection context giving access to the filtered POS list.</param>
        /// <returns>The feature values created from the POS vector.</returns>
        public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
        {
            return(FeatureValue.NewArray(context.FilteredPosList.GetVector(encounter.Pos)));
        }
Example #9
0
        /// <summary>
        /// Slides a context window over the input encounters and emits one
        /// <c>RawRecord</c> for every window whose centre word has a non-blank meaning
        /// and more than one meaning in the project dictionary.
        /// </summary>
        /// <param name="input">Encounters to scan, in order.</param>
        /// <param name="project">Project whose dictionary decides which words are recorded.</param>
        /// <param name="info">Generation settings: left/right context sizes, the part-of-speech
        /// filter and the overlap flag.</param>
        /// <returns>The generated records, in the order their centre words were reached.</returns>
        private IList <RawRecord> GenerateRecords(
            IList <RawWordEncounter> input, WsdProject project, GenerationInfo info)
        {
            // The window holds LeftContext words, the word itself, then RightContext words;
            // the word under consideration therefore sits at index LeftContext.
            var contextWindowLength = info.LeftContext + 1 + info.RightContext;
            var wordIndexInContext  = info.LeftContext;
            var contextWindow       = new RawWordEncounter[contextWindowLength];
            var records             = new List <RawRecord>();

            // Seed the window with an end-of-sentence marker before any input is read.
            // NOTE(review): ShiftLeft is defined elsewhere; presumably it shifts the
            // elements one slot left and places its argument in the last slot - confirm.
            contextWindow.ShiftLeft(new RawWordEncounter
            {
                Word = RawWordEncounter.EndOfSentence
            });

            using (var enumerator = input.GetEnumerator())
            {
                bool moveNext;

                do
                {
                    moveNext = enumerator.MoveNext();

                    if (moveNext)
                    {
                        // Encounters with a non-empty part of speech that is not in the
                        // filter list never enter the window. `continue` in a do/while
                        // jumps to the loop condition, so the next input item is read.
                        if (!string.IsNullOrEmpty(enumerator.Current.Pos) &&
                            !info.FilteredPosList.Contains(enumerator.Current.Pos))
                        {
                            continue;
                        }

                        contextWindow.ShiftLeft(enumerator.Current);
                    }
                    else
                    {
                        // Input exhausted: keep shifting without a new element so the
                        // encounters still in the window get their turn at the centre.
                        contextWindow.ShiftLeft();
                    }

                    var currentEncounter = contextWindow[wordIndexInContext];

                    // Skip centre slots that are unfilled, placeholder/end-of-sentence
                    // markers, or words without a meaning.
                    if (currentEncounter == null ||
                        currentEncounter.Word == RawWordEncounter.EmptyWord ||
                        currentEncounter.Word == RawWordEncounter.EndOfSentence ||
                        string.IsNullOrWhiteSpace(currentEncounter.Meaning))
                    {
                        continue;
                    }

                    var dictionaryWord = project.Dictionary.GetByName(currentEncounter.Word);

                    // Only words known to the dictionary with at least two meanings are recorded.
                    if (dictionaryWord == null || dictionaryWord.Meanings.Count <= 1)
                    {
                        continue;
                    }

                    // The record's context is the window without the centre word.
                    var context = new RawWordEncounter[contextWindowLength - 1];

                    if (info.Overlap)
                    {
                        // Overlap mode: copy every window slot except the centre,
                        // substituting the empty-word encounter for null slots.
                        for (var i = 0; i < contextWindowLength; i++)
                        {
                            if (i == wordIndexInContext)
                            {
                                continue;
                            }

                            // Slots after the centre move down by one in the context array.
                            var indexInBuffer = i < wordIndexInContext ? i : i - 1;

                            context[indexInBuffer] = contextWindow[i] ?? RawWordEncounter.EmptyWordEncounter;
                        }
                    }
                    else
                    {
                        // Non-overlap mode: walk outwards from the centre on each side.
                        // The end-of-sentence marker itself is copied, but every slot
                        // farther from the centre than the marker becomes the empty-word
                        // encounter.
                        var endOfSentence = false;

                        for (var i = wordIndexInContext - 1; i >= 0; i--)
                        {
                            context[i] = endOfSentence
                                ? RawWordEncounter.EmptyWordEncounter
                                : contextWindow[i] ?? RawWordEncounter.EmptyWordEncounter;

                            if (contextWindow[i]?.Word == RawWordEncounter.EndOfSentence)
                            {
                                endOfSentence = true;
                            }
                        }

                        endOfSentence = false;

                        for (var i = wordIndexInContext + 1; i < contextWindowLength; i++)
                        {
                            context[i - 1] = endOfSentence
                                ? RawWordEncounter.EmptyWordEncounter
                                : contextWindow[i] ?? RawWordEncounter.EmptyWordEncounter;

                            if (contextWindow[i]?.Word == RawWordEncounter.EndOfSentence)
                            {
                                endOfSentence = true;
                            }
                        }
                    }

                    records.Add(new RawRecord
                    {
                        Word    = currentEncounter.Word,
                        Meaning = currentEncounter.Meaning,
                        Pos     = currentEncounter.Pos,
                        Context = context
                    });
                // Continue while there is input left or the window still reports content.
                } while (moveNext || !contextWindow.IsEmpty());
            }

            return(records);
        }
Example #10
0
 /// <summary>
 /// Produces a single feature holding the value that the filtered part-of-speech
 /// list returns from <c>GetOrDefault</c> for the encounter's part of speech.
 /// </summary>
 /// <param name="encounter">Encounter whose part of speech is looked up.</param>
 /// <param name="context">Selection context giving access to the filtered POS list.</param>
 /// <returns>A one-element list containing the POS feature.</returns>
 public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
 {
     var posValue   = context.FilteredPosList.GetOrDefault(encounter.Pos);
     var posFeature = new FeatureValue(posValue);

     return(new[] { posFeature });
 }
Example #11
0
 /// <summary>
 /// Produces a single feature holding the encounter's word.
 /// </summary>
 /// <param name="encounter">Encounter whose word becomes the feature value.</param>
 /// <param name="context">Selection context (not used by this feature).</param>
 /// <returns>A one-element list containing the word feature.</returns>
 public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
 {
     var wordFeature = new FeatureValue(encounter.Word);

     return(new[] { wordFeature });
 }
Example #12
0
        /// <summary>
        /// Produces the feature values built from the project's embedding of the
        /// encounter's meaning, as returned by <c>GetVectorOrDefault</c>.
        /// </summary>
        /// <param name="encounter">Encounter whose meaning is looked up.</param>
        /// <param name="context">Selection context giving access to the project's meaning embeddings.</param>
        /// <returns>The feature values created from the embedding.</returns>
        public IList <FeatureValue> GetValues(RawWordEncounter encounter, FeatureSelectionContext context)
        {
            return(FeatureValue.NewArray(
                       context.Project.MeaningEmbeddings.GetVectorOrDefault(encounter.Meaning)));
        }
Example #13
0
        /// <summary>
        /// Reads an XML corpus together with its gold-key file and converts every text
        /// into a <c>TextData</c> holding its word encounters.
        /// </summary>
        /// <remarks>
        /// Each sentence contributes one encounter per item followed by the
        /// end-of-sentence encounter. Items of type <c>ItemChoiceType.instance</c> have
        /// their meaning resolved through <c>SynsetHelper.TryGetMeaning</c>; failures are
        /// reported through <paramref name="errors"/> and leave the meaning empty.
        /// </remarks>
        /// <param name="dataPath">Path of the XML data file.</param>
        /// <param name="goldKeyPath">Path of the gold-key file; each non-blank line is
        /// split on spaces into a key (first token) and a value (remaining tokens).</param>
        /// <param name="synsetMappings">Synset mappings passed to the meaning lookup.</param>
        /// <param name="dictionary">Word dictionary passed to the meaning lookup.</param>
        /// <param name="errors">Receives one entry per instance whose meaning lookup did not return OK.</param>
        /// <param name="progress">Optional progress handle; a scope of 1 is opened for the
        /// duration of the call.</param>
        /// <returns>One <c>TextData</c> per text in the data file.</returns>
        public static TextData[] Read(
            string dataPath, string goldKeyPath, SynsetDictionary synsetMappings,
            WordDictionary dictionary, out XmlParseError[] errors,
            IProgressHandle progress = null)
        {
            var scope = progress?.Scope(1);

            try
            {
                var texts         = new List <TextData>();
                var xmlSerializer = new XmlSerializer(typeof(UefXmlData));
                var parseErrors   = new List <XmlParseError>();

                using (var dataReader = new StreamReader(dataPath))
                {
                    // Lines with a single token carry no value and are dropped; for
                    // duplicate keys only the first line is kept.
                    var goldKeyTokens = File.ReadAllLines(goldKeyPath)
                                        .Where(line => !string.IsNullOrWhiteSpace(line))
                                        .Select(line => line.Trim(' ').Split(' '))
                                        .Where(tokens => tokens.Length > 1)
                                        .DistinctBy(tokens => tokens[0]);

                    var goldKeys = goldKeyTokens.ToDictionary(
                        tokens => tokens[0],
                        tokens => string.Join(" ", tokens.Skip(1)));

                    var document = (UefXmlData)xmlSerializer.Deserialize(dataReader);

                    foreach (var text in document.Texts)
                    {
                        var textEncounters = new List <RawWordEncounter>();

                        foreach (var sentence in text.Sentences)
                        {
                            for (var index = 0; index < sentence.Encounters.Length; index++)
                            {
                                var item     = sentence.Encounters[index];
                                var itemType = sentence.EnumTypes[index];

                                var rawEncounter = new RawWordEncounter
                                {
                                    Word    = item.Lemma,
                                    Pos     = item.Pos,
                                    Meaning = string.Empty
                                };

                                if (itemType == ItemChoiceType.instance)
                                {
                                    var status = SynsetHelper.TryGetMeaning(
                                        dictionary, goldKeys, synsetMappings,
                                        item.Lemma, item.Id, out var resolvedMeaning);

                                    if (status != TryGetMeaningStatus.OK)
                                    {
                                        parseErrors.Add(new XmlParseError
                                        {
                                            EncounterId = item.Id,
                                            Error       = status
                                        });
                                    }
                                    else
                                    {
                                        rawEncounter.Meaning = resolvedMeaning;
                                    }
                                }

                                textEncounters.Add(rawEncounter);
                            }

                            textEncounters.Add(RawWordEncounter.EndOfSentenceEncounter);
                        }

                        texts.Add(new TextData(text.Id, textEncounters.ToArray()));
                    }
                }

                errors = parseErrors.ToArray();

                return(texts.ToArray());
            }
            finally
            {
                scope?.Dispose();
            }
        }
Example #14
0
 /// <summary>
 /// Creates a pairing of an encounter with the feature values computed for it.
 /// </summary>
 /// <param name="encounter">The encounter the values belong to.</param>
 /// <param name="values">The feature values stored alongside the encounter.</param>
 public EncounterValues(RawWordEncounter encounter, IList <FeatureValue> values)
 {
     this.Encounter = encounter;
     this.Values    = values;
 }