示例#1
0
        /// <summary>
        /// Merges the context words of every word form of <paramref name="word"/> and
        /// divides each merged count by the number of word forms. The result is read
        /// from <c>AllWordFormsContextCache</c> when present and stored there otherwise.
        /// </summary>
        /// <param name="word">The word whose word forms are looked up.</param>
        /// <param name="langCode">Language code forwarded to the dictionary, the context retrieval and the cache.</param>
        /// <returns>The cached or newly computed context words with normalized counts.</returns>
        private static WordsAndCounts RetrieveContextWordsForAllWordForms(string word, string langCode)
        {
            // Serve the answer straight from the cache when it is available
            if (AllWordFormsContextCache.IsInCache(word, langCode))
            {
                return AllWordFormsContextCache.LoadFromCache(word, langCode);
            }

            // Retrieve and merge the contexts of all word forms of the given word
            ICollection<string> wordForms = LemmaDictionaryUtils.GetAllWordForms(word, langCode);
            WordsAndCounts mergedContext = new WordsAndCounts();
            foreach (string form in wordForms)
            {
                mergedContext.AddAll(RetrieveContextWords(form, langCode));
            }

            // Normalize the counts. The loop runs over a copy of the word list,
            // since the counts are reassigned inside the loop.
            int formCount = wordForms.Count;
            List<string> wordsSnapshot = new List<string>();
            wordsSnapshot.AddRange(mergedContext.Words);
            foreach (string contextWord in wordsSnapshot)
            {
                double mergedCount = mergedContext[contextWord];
                mergedContext[contextWord] = mergedCount / formCount;
            }

            AllWordFormsContextCache.AddToCache(word, langCode, mergedContext);
            return mergedContext;
        }
        /// <summary>
        /// Returns the lemmas for <paramref name="word"/>: the basic forms reported by
        /// <c>LemmaDictionaryUtils.GetBasicForms</c> when <c>USE_LEMMATIZATION</c> is set,
        /// otherwise a new single-element list holding the word itself.
        /// </summary>
        /// <param name="word">The word to look up.</param>
        /// <param name="langCode">Language code forwarded to the lemma dictionary.</param>
        /// <returns>The list of lemmas.</returns>
        private List<string> GetLemmas(string word, string langCode)
        {
            if (!USE_LEMMATIZATION)
            {
                // Lemmatization is switched off: the word is its own only lemma
                return new List<string> { word };
            }

            return LemmaDictionaryUtils.GetBasicForms(word, langCode);
        }
        /// <summary>
        /// Collects the distinct non-basic forms of every basic form of
        /// <paramref name="word"/>.
        /// </summary>
        /// <param name="word">The word whose basic forms are looked up first.</param>
        /// <param name="langCode">Language code forwarded to the dictionary lookups.</param>
        /// <returns>The key collection of the internal dictionary, i.e. each word form once.</returns>
        public static ICollection<string> GetAllWordForms(string word, string langCode)
        {
            // The dictionary is used as a set: only its keys matter, the values are always true
            var uniqueForms = new Dictionary<string, bool>();

            foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(word, langCode))
            {
                foreach (string form in LemmaDictionaryUtils.GetNonBasicForms(lemma, langCode))
                {
                    uniqueForms[form] = true;
                }
            }

            return uniqueForms.Keys;
        }
        /// <summary>
        /// Converts a set of counted words into a set of counted basic forms by looking
        /// up the basic forms of every word and adding them to the result.
        /// </summary>
        /// <param name="words">The counted words to convert.</param>
        /// <param name="langCode">Language code forwarded to the basic-form lookup.</param>
        /// <returns>A new <c>WordsAndCounts</c> holding the basic forms.</returns>
        public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
        {
            WordsAndCounts result = new WordsAndCounts();

            foreach (WordAndCount entry in words.AsSortedArray)
            {
                List<string> lemmas = LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode);
                foreach (string lemma in lemmas)
                {
                    // Each basic form is added with the word's full count; the count
                    // is not divided by the number of basic forms.
                    result.Add(lemma, (double)entry.Count);
                }
            }

            return result;
        }
示例#5
0
 /// <summary>
 /// Returns the count of <paramref name="word"/> in <paramref name="context"/>.
 /// With <c>useLemmatization</c> set, the counts of all word forms of the word are
 /// summed; otherwise only the count stored for the word itself is returned.
 /// </summary>
 /// <param name="word">The word to count.</param>
 /// <param name="langCode">Language code forwarded to the word-form lookup.</param>
 /// <param name="context">The counted context words to read from.</param>
 /// <returns>The (possibly summed) count.</returns>
 private static double GetWordCountInContext(string word, String langCode, WordsAndCounts context)
 {
     if (!useLemmatization)
     {
         // No lemmatization: a single direct lookup is enough
         double directCount = context[word];
         return directCount;
     }

     double total = 0;
     foreach (string form in LemmaDictionaryUtils.GetAllWordForms(word, langCode))
     {
         total += context[form];
     }
     return total;
 }
示例#6
0
        /// <summary>
        /// Builds a lookup dictionary whose keys are the word forms to match for
        /// <paramref name="word"/>: all word forms from the lemma dictionary when
        /// <c>useLemmatization</c> is set, otherwise just the word itself.
        /// Every value is <c>true</c>.
        /// </summary>
        /// <param name="word">The word to build the lookup for.</param>
        /// <param name="langCode">Language code forwarded to the word-form lookup.</param>
        /// <returns>A new dictionary keyed by word form.</returns>
        private static Dictionary<string, bool> GetAllWordFormsDictionary(
            string word, string langCode)
        {
            var formsLookup = new Dictionary<string, bool>();

            if (!useLemmatization)
            {
                formsLookup.Add(word, true);
                return formsLookup;
            }

            foreach (string form in LemmaDictionaryUtils.GetAllWordForms(word, langCode))
            {
                formsLookup.Add(form, true);
            }
            return formsLookup;
        }
示例#7
0
        /// <summary>
        /// Retrieves the context words of <paramref name="word"/> from web context
        /// sentences (titles and texts). The result is read from
        /// <c>ContextWordsCache</c> when present and stored there otherwise.
        /// With <c>useLemmatization</c> set, the extracted context words are converted
        /// to their basic forms before being cached and returned.
        /// </summary>
        /// <param name="word">The word to retrieve contexts for; it is lower-cased before use.</param>
        /// <param name="langCode">Language code forwarded to retrieval, extraction, dictionary and cache.</param>
        /// <returns>The cached or newly extracted context words with their counts.</returns>
        public static WordsAndCounts RetrieveContextWords(string word, string langCode)
        {
            // Normalize BEFORE the cache lookup: results are stored under the
            // lower-cased word, so probing the cache with the raw word would miss
            // for every input that contains upper-case letters.
            word = word.ToLowerInvariant();

            if (ContextWordsCache.IsInCache(word, langCode))
            {
                return ContextWordsCache.LoadFromCache(word, langCode);
            }

            WordsAndCounts contextWords = new WordsAndCounts();
            List<string> titleSentences, textSentences;
            RetrieveWebContextSentences(word, langCode, out titleSentences, out textSentences);

            // The word forms are passed both as a lookup and as the list of phrases to keep
            Dictionary<string, bool> allWordFormsDict = GetAllWordFormsDictionary(word, langCode);
            List<string> phrasesToKeep = new List<string>(allWordFormsDict.Keys);

            // Titles are processed first, then texts; both add to the same contextWords
            foreach (string titleSentence in titleSentences)
            {
                ExtractWordsAround(titleSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
            }
            foreach (string textSentence in textSentences)
            {
                ExtractWordsAround(textSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
            }

            // With lemmatization, the basic forms are what gets cached and returned
            WordsAndCounts result = useLemmatization
                ? LemmaDictionaryUtils.GetBasicForms(contextWords, langCode)
                : contextWords;

            ContextWordsCache.AddToCache(word, langCode, result);
            return result;
        }
        /// <summary>
        /// Calculates the similarity between a Bulgarian and a Russian word as the
        /// largest <c>CalcMMEDR</c> value over all pairs of their word forms.
        /// The Bulgarian forms are the lower-cased word and, when lemmatization
        /// applies, its basic forms. The Russian forms are the lower-cased word, its
        /// <c>TransformRussianInflextionToBulgarian</c> result and, when lemmatization
        /// applies, each basic form together with its transformed variant.
        /// </summary>
        /// <param name="bgWord">The Bulgarian word.</param>
        /// <param name="ruWord">The Russian word.</param>
        /// <returns>The maximum <c>CalcMMEDR</c> value found, or 0 if no pair exceeds 0.</returns>
        public static double CalculateSimilarity(string bgWord, string ruWord)
        {
            bgWord = bgWord.ToLowerInvariant();
            ruWord = ruWord.ToLowerInvariant();

            // Lemmas are only used when lemmatization is enabled and BOTH words are
            // long enough; the same condition applies to both languages.
            bool lemmatize = USE_LEMMATIZATION &&
                             bgWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION &&
                             ruWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION;

            // Prepare all forms of the Bulgarian word
            var bgForms = new HashSet<string> { bgWord };
            if (lemmatize)
            {
                foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(bgWord, LANG_CODE_BG))
                {
                    bgForms.Add(lemma);
                }
            }

            // Prepare all forms of the Russian word, each with its transformed variant
            var ruForms = new HashSet<string> { ruWord };
            ruForms.Add(TransformRussianInflextionToBulgarian(ruWord));
            if (lemmatize)
            {
                foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(ruWord, LANG_CODE_RU))
                {
                    ruForms.Add(lemma);
                    ruForms.Add(TransformRussianInflextionToBulgarian(lemma));
                }
            }

            // Keep the best MMEDR over every Bulgarian/Russian form pair
            double best = 0;
            foreach (string bgForm in bgForms)
            {
                foreach (string ruForm in ruForms)
                {
                    double score = CalcMMEDR(bgForm, ruForm);
                    if (score > best)
                    {
                        best = score;
                    }
                }
            }

            return best;
        }