Ejemplo n.º 1
0
        /// <summary>
        /// Looks up (or creates) the lexeme for a word and enriches it in up to two passes: a WordNet synset
        /// pass (tracked by IsSynMapped) and a MirriamWebsterAPI dictionary/thesaurus pass (tracked by
        /// MirriamIndexed). Related lexemes created along the way are saved and cached as well.
        /// </summary>
        /// <param name="language">language the lexeme belongs to; its name is part of the cache key</param>
        /// <param name="word">just the text of the word; it is lower-cased before use</param>
        /// <param name="wordType">lexical type used when the lexeme has to be created; Article, Conjunction,
        /// ProperNoun, Pronoun and None skip the WordNet lookup</param>
        /// <param name="processedWords">running list of handled words; this word and the related words that
        /// get created are appended to it</param>
        /// <returns>The lexeme, or null when the word is null/blank, consists only of hyphens, or contains
        /// anything other than a-z, space or hyphen</returns>
        public static ILexeme CreateOrModifyLexeme(ILanguage language, string word, LexicalType wordType, ref List <string> processedWords)
        {
            if (string.IsNullOrWhiteSpace(word))
            {
                return null;
            }

            //Invariant casing: the filter below is ASCII-only, so culture-specific lower-casing
            //(e.g. Turkish dotless i) must not leak into the word
            word = word.ToLowerInvariant();

            Regex rgx = new Regex("[^a-z -]");

            if (rgx.IsMatch(word) || word.All(ch => ch == '-'))
            {
                return null;
            }

            ILexeme newLex = ConfigDataCache.Get <ILexeme>(string.Format("{0}_{1}_{2}", ConfigDataType.Dictionary, language.Name, word));

            if (newLex == null)
            {
                newLex = language.CreateOrModifyLexeme(word, wordType, new string[0]);
            }

            bool alreadyProcessed = processedWords.Contains(word);

            if ((newLex.IsSynMapped && newLex.MirriamIndexed) || alreadyProcessed)
            {
                if (!alreadyProcessed)
                {
                    processedWords.Add(word);
                }
            }
            else
            {
                LexicalType[] invalidTypes = new LexicalType[] { LexicalType.Article, LexicalType.Conjunction, LexicalType.ProperNoun, LexicalType.Pronoun, LexicalType.None };

                processedWords.Add(word);

                //This is wordnet processing, wordnet doesnt have any of the above and will return weird results if we let it
                if (!invalidTypes.Contains(wordType))
                {
                    var synSets = WordNetHarness.GetSynSets(word, new PartOfSpeech[] { PartOfSpeech.Adjective, PartOfSpeech.Adverb, PartOfSpeech.Noun, PartOfSpeech.Verb });

                    //We in theory have every single word form for this word now
                    if (synSets != null)
                    {
                        SemanticContext[] invalidContexts = new SemanticContext[]
                        { SemanticContext.Group, SemanticContext.Event, SemanticContext.Location, SemanticContext.Competition, SemanticContext.Person
                          , SemanticContext.Plant, SemanticContext.Animal, SemanticContext.Time, SemanticContext.Artifact };

                        foreach (SynSet synSet in synSets)
                        {
                            if (synSet.PartOfSpeech == PartOfSpeech.None)
                            {
                                continue;
                            }

                            var synContext = TranslateContext(synSet.LexicographerFileName);

                            if (invalidContexts.Contains(synContext))
                            {
                                continue;
                            }

                            var synType = MapLexicalTypes(synSet.PartOfSpeech);
                            var newDict = newLex.GetForm(synType, -1);

                            //No form of this type yet: create it and stamp it with the synset's context
                            if (newDict == null)
                            {
                                newLex          = language.CreateOrModifyLexeme(word, synType, new string[0]);
                                newDict         = newLex.GetForm(synType, -1);
                                newDict.Context = synContext;
                            }

                            //We're going to use the definition from here
                            if (!string.IsNullOrWhiteSpace(synSet.Gloss))
                            {
                                newDict.Definition = synSet.Gloss;
                            }

                            foreach (string synWord in synSet.Words)
                            {
                                var newWord = synWord.ToLowerInvariant().Replace("_", " ");

                                //Validate before recording, so unusable text never lands in processedWords
                                if (rgx.IsMatch(newWord) || string.IsNullOrWhiteSpace(newWord) || newWord.All(ch => ch == '-') || newWord.IsNumeric())
                                {
                                    continue;
                                }

                                processedWords.Add(newWord);

                                var semantics = newDict.Semantics.ToArray();
                                var synLex    = language.CreateOrModifyLexeme(newWord, synType, semantics);

                                var synDict = synLex.GetForm(synType, semantics, false);
                                synDict.Elegance   = 0;
                                synDict.Quality    = 0;
                                synDict.Severity   = 0;
                                synDict.Context    = synContext;
                                synDict.Definition = newDict.Definition;

                                synLex.PersistToCache();
                                synLex.SystemSave();

                                //A word is never linked to itself
                                if (!newWord.Equals(word))
                                {
                                    newDict.MakeRelatedWord(language, newWord, true, synDict);
                                }
                            }
                        }
                    }
                }

                newLex.IsSynMapped = true;
                newLex.SystemSave();
                newLex.PersistToCache();
            }

            if (!newLex.MirriamIndexed)
            {
                //NOTE(review): if GetForm(0) returns null every use below throws inside the try blocks and is swallowed - confirm
                var newDict = newLex.GetForm(0);

                try
                {
                    var dictEntry = MirriamWebsterAPI.GetDictionaryEntry(newLex.Name);
                    if (dictEntry != null)
                    {
                        //Stuff done to modify all forms of the lexeme
                        foreach (var dict in newLex.WordForms)
                        {
                            dict.Vulgar = dictEntry.meta.offensive;
                        }

                        //Stuff done based on the dictionary return data
                        foreach (var stemWord in dictEntry.uros)
                        {
                            var stemType = MapLexicalTypes(stemWord.fl);

                            if (newLex.GetForm(stemType) != null)
                            {
                                continue;
                            }

                            var     wordText = stemWord.ure.Replace("*", "");
                            ILexeme stemLex  = ConfigDataCache.Get <ILexeme>(string.Format("{0}_{1}_{2}", ConfigDataType.Dictionary, language.Name, wordText));

                            if (stemLex != null)
                            {
                                continue;
                            }

                            //Empty semantics array (not null), matching every other creation call in this method
                            stemLex = language.CreateOrModifyLexeme(wordText, stemType, new string[0]);

                            //The stem inherits the ratings and meaning of the base word's first form
                            var stemDict = stemLex.GetForm(0);
                            stemDict.Elegance   = newDict.Elegance;
                            stemDict.Quality    = newDict.Quality;
                            stemDict.Severity   = newDict.Severity;
                            stemDict.Context    = newDict.Context;
                            stemDict.Definition = newDict.Definition;
                            stemDict.Semantics  = newDict.Semantics;
                            processedWords.Add(wordText);

                            stemLex.SystemSave();
                            stemLex.PersistToCache();
                        }

                        newDict.Semantics = new HashSet <string>(dictEntry.sls);
                    }
                }
                catch
                {
                    //Best effort: any failure in the dictionary pass is swallowed and the lexeme is
                    //still flagged as MirriamIndexed below
                }

                try
                {
                    var thesEntry = MirriamWebsterAPI.GetThesaurusEntry(newLex.Name);
                    if (thesEntry != null)
                    {
                        var thesType = MapLexicalTypes(thesEntry.fl);

                        foreach (var synonym in thesEntry.meta.syns.SelectMany(syn => syn))
                        {
                            var newWord = synonym.ToLowerInvariant().Replace("_", " ");

                            if (rgx.IsMatch(newWord) || string.IsNullOrWhiteSpace(newWord) || newWord.All(ch => ch == '-'))
                            {
                                continue;
                            }

                            var semantics = newDict.Semantics.ToArray();
                            var synLex    = language.CreateOrModifyLexeme(newWord, thesType, semantics);

                            var synDict = synLex.GetForm(thesType, semantics, false);
                            synDict.Elegance   = 0;
                            synDict.Quality    = 0;
                            synDict.Severity   = 0;
                            synDict.Context    = newDict.Context;
                            synDict.Definition = newDict.Definition;

                            synLex.PersistToCache();
                            synLex.SystemSave();
                            processedWords.Add(newWord);

                            if (!newWord.Equals(word))
                            {
                                newDict.MakeRelatedWord(language, newWord, true, synDict);
                            }
                        }

                        //Same handling as synonyms, except the relation flag passed to MakeRelatedWord is false
                        foreach (var antonym in thesEntry.meta.ants.SelectMany(syn => syn))
                        {
                            var newWord = antonym.ToLowerInvariant().Replace("_", " ");

                            if (rgx.IsMatch(newWord) || string.IsNullOrWhiteSpace(newWord) || newWord.All(ch => ch == '-'))
                            {
                                continue;
                            }

                            var semantics = newDict.Semantics.ToArray();
                            var synLex    = language.CreateOrModifyLexeme(newWord, thesType, semantics);

                            var synDict = synLex.GetForm(thesType, semantics, false);
                            synDict.Elegance   = 0;
                            synDict.Quality    = 0;
                            synDict.Severity   = 0;
                            synDict.Context    = newDict.Context;
                            synDict.Definition = newDict.Definition;

                            synLex.PersistToCache();
                            synLex.SystemSave();
                            processedWords.Add(newWord);

                            if (!newWord.Equals(word))
                            {
                                newDict.MakeRelatedWord(language, newWord, false, synDict);
                            }
                        }
                    }
                }
                catch
                {
                    //Best effort: any failure in the thesaurus pass is swallowed and the lexeme is
                    //still flagged as MirriamIndexed below
                }

                newLex.MirriamIndexed = true;
                newLex.SystemSave();
                newLex.PersistToCache();
            }

            if (!newLex.IsTranslated)
            {
                //Placeholder: no translation work is performed here
            }

            return newLex;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Looks up (or creates) the lexeme for a word with no word form basis and, unless it is already
        /// synonym-mapped or was already processed, builds word forms and related words for it from a
        /// WordNet overview search. The lexeme is then flagged as IsSynMapped, saved and cached.
        /// </summary>
        /// <param name="language">language the lexeme belongs to; its name is part of the cache key</param>
        /// <param name="word">just the text of the word; it is lower-cased and stripped of everything except
        /// a-z, space and hyphen before use</param>
        /// <param name="processedWords">running list of handled words; the cleaned word is appended when missing</param>
        /// <returns>A lexeme, or null when nothing usable remains of the word after cleaning</returns>
        public static ILexeme CreateOrModifyLexeme(ILanguage language, string word, ref List <string> processedWords)
        {
            if (string.IsNullOrWhiteSpace(word))
            {
                return null;
            }

            //Invariant casing: the filter below is ASCII-only, so culture-specific lower-casing
            //(e.g. Turkish dotless i) must not cause letters to be stripped
            word = word.ToLowerInvariant();

            Regex rgx = new Regex("[^a-z -]");

            word = rgx.Replace(word, "");

            if (string.IsNullOrWhiteSpace(word) || word.All(ch => ch == '-'))
            {
                return null;
            }

            ILexeme newLex = ConfigDataCache.Get <ILexeme>(string.Format("{0}_{1}", language.Name, word));

            if (newLex == null)
            {
                newLex = language.CreateOrModifyLexeme(word, LexicalType.None, new string[0]);
            }

            bool alreadyProcessed = processedWords.Contains(word);

            //Nothing more to do for words that were mapped before or already seen in this run
            if (newLex.IsSynMapped || alreadyProcessed)
            {
                if (!alreadyProcessed)
                {
                    processedWords.Add(word);
                }

                return newLex;
            }

            processedWords.Add(word);

            bool          exists    = true;
            SearchSet     searchSet = null;
            List <Search> results   = new List <Search>();

            WordNet.OverviewFor(word, string.Empty, ref exists, ref searchSet, results);

            //We in theory have every single word form for this word now
            if (exists)
            {
                LexicalType[] invalidTypes = new LexicalType[] { LexicalType.Article, LexicalType.Conjunction, LexicalType.ProperNoun, LexicalType.Pronoun, LexicalType.None };

                foreach (SynonymSet synSet in results.SelectMany(result => result.senses))
                {
                    //The semantics are the usable words of the definition: the text before the first ';',
                    //or everything but the final character when there is no ';'
                    List <string> semantics     = new List <string>();
                    string        rawDefinition = synSet.defn ?? string.Empty;
                    int           indexSplit    = rawDefinition.IndexOf(';');
                    //Clamp so an empty definition yields an empty string instead of throwing
                    int           cutoff        = indexSplit < 0 ? Math.Max(0, rawDefinition.Length - 1) : indexSplit;
                    string        definition    = rawDefinition.Substring(0, cutoff).Trim();

                    foreach (string defWord in definition.Split(' '))
                    {
                        var currentWord = rgx.Replace(defWord.ToLowerInvariant(), "");

                        //Skip the word itself and any token that has nothing usable left after cleaning
                        if (currentWord.Equals(word) || string.IsNullOrWhiteSpace(currentWord) || currentWord.All(ch => ch == '-') || currentWord.IsNumeric())
                        {
                            continue;
                        }

                        var defLex = language.CreateOrModifyLexeme(currentWord, LexicalType.None, new string[0]);

                        //Only words without any of the invalid lexical types count as semantics
                        if (defLex != null && !defLex.ContainedTypes().Any(typ => invalidTypes.Contains(typ)))
                        {
                            semantics.Add(currentWord);
                        }
                    }

                    var type           = MapLexicalTypes(synSet.pos.Flag);
                    var semanticsArray = semantics.ToArray();

                    newLex = language.CreateOrModifyLexeme(word, type, semanticsArray);
                    var newDict = newLex.GetForm(type, semanticsArray, false);

                    //wnsns of the matched word is used as the severity (original note: wsns indicates hypo/hypernymity)
                    int baseWeight = synSet.words[Math.Max(0, synSet.whichword - 1)].wnsns;
                    newDict.Severity = baseWeight;
                    newDict.Elegance = Math.Max(0, newDict.Name.SyllableCount() * 3);
                    newDict.Quality  = synSet.words.Count();

                    foreach (Lexeme synWord in synSet.words)
                    {
                        //Don't bother if this word is already the same word we started with
                        if (synWord.word.Equals(newDict.Name, StringComparison.InvariantCultureIgnoreCase))
                        {
                            continue;
                        }

                        //it's a phrase - phrases are not turned into related words
                        if (synWord.word.Contains("_"))
                        {
                            continue;
                        }

                        var newWord = rgx.Replace(synWord.word.ToLowerInvariant(), "");

                        if (newWord.Equals(word) || string.IsNullOrWhiteSpace(newWord) || newWord.All(ch => ch == '-') || newWord.IsNumeric())
                        {
                            continue;
                        }

                        //The cleaned text is only used for validation; the related word is made from the original text
                        newDict.MakeRelatedWord(language, synWord.word, true);
                    }
                }
            }

            newLex.IsSynMapped = true;
            newLex.SystemSave();
            newLex.PersistToCache();

            return newLex;
        }