/// <summary>
/// Returns the lemmas to use for <paramref name="word"/>.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <param name="langCode">Language code passed through to the lemma dictionary.</param>
/// <returns>
/// The result of <c>LemmaDictionaryUtils.GetBasicForms</c> when
/// <c>USE_LEMMATIZATION</c> is set; otherwise a new single-element list
/// containing <paramref name="word"/> unchanged.
/// </returns>
private List <string> GetLemmas(string word, string langCode)
        {
            if (!USE_LEMMATIZATION)
            {
                // Lemmatization is switched off: the word stands in for itself.
                return new List<string> { word };
            }

            return LemmaDictionaryUtils.GetBasicForms(word, langCode);
        }
        /// <summary>
        /// Collects, without duplicates, the non-basic forms of every basic form
        /// of <paramref name="word"/>.
        /// </summary>
        /// <param name="word">The word whose forms are requested.</param>
        /// <param name="langCode">Language code passed through to the lemma dictionary.</param>
        /// <returns>
        /// The key collection of an internal dictionary holding each distinct
        /// form returned by <c>LemmaDictionaryUtils.GetNonBasicForms</c>.
        /// NOTE(review): the basic forms themselves are only present if
        /// <c>GetNonBasicForms</c> returns them — confirm against that helper.
        /// </returns>
        public static ICollection <string> GetAllWordForms(string word, string langCode)
        {
            // The dictionary is used purely as a set; the bool values carry no meaning.
            Dictionary<string, bool> seenForms = new Dictionary<string, bool>();

            foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(word, langCode))
            {
                foreach (string form in LemmaDictionaryUtils.GetNonBasicForms(lemma, langCode))
                {
                    seenForms[form] = true;
                }
            }

            return seenForms.Keys;
        }
        /// <summary>
        /// Builds a new <c>WordsAndCounts</c> in which every word of
        /// <paramref name="words"/> is replaced by its basic forms.
        /// </summary>
        /// <param name="words">Source words with their counts; read via <c>AsSortedArray</c>.</param>
        /// <param name="langCode">Language code passed through to the lemma dictionary.</param>
        /// <returns>
        /// A new collection to which each basic form has been added together with
        /// the count of the word it was derived from.
        /// </returns>
        public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
        {
            WordsAndCounts lemmatized = new WordsAndCounts();

            foreach (WordAndCount entry in words.AsSortedArray)
            {
                List<string> lemmas = LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode);

                foreach (string lemma in lemmas)
                {
                    // Every basic form receives the word's full count; the count is
                    // deliberately not divided by the number of basic forms.
                    lemmatized.Add(lemma, (double)entry.Count);
                }
            }

            return lemmatized;
        }
Пример #4
0
        /// <summary>
        /// Returns the context words for <paramref name="word"/>, using
        /// <c>ContextWordsCache</c> to avoid repeating the retrieval.
        /// </summary>
        /// <remarks>
        /// The word is lower-cased (invariant culture) before the cache is
        /// consulted, so that the lookup key matches the key under which results
        /// are stored. On a cache miss, sentences are obtained through
        /// <c>RetrieveWebContextSentences</c>, words around any form of the word
        /// are extracted from the title sentences and then the text sentences,
        /// and the result is cached before being returned.
        /// </remarks>
        /// <param name="word">The word whose context is requested; casing is ignored.</param>
        /// <param name="langCode">Language code passed through to the helpers and the cache.</param>
        /// <returns>
        /// The cached entry when one exists; otherwise the freshly extracted
        /// context words, reduced to basic forms when <c>useLemmatization</c> is set.
        /// </returns>
        public static WordsAndCounts RetrieveContextWords(string word, string langCode)
        {
            // Normalize BEFORE probing the cache: entries are stored under the
            // lower-cased word, so probing with the caller's casing would miss
            // for any input that is not already lower-case.
            word = word.ToLowerInvariant();

            if (ContextWordsCache.IsInCache(word, langCode))
            {
                return ContextWordsCache.LoadFromCache(word, langCode);
            }

            WordsAndCounts contextWords = new WordsAndCounts();
            List<string> titleSentences, textSentences;

            RetrieveWebContextSentences(word, langCode, out titleSentences, out textSentences);

            Dictionary<string, bool> allWordFormsDict = GetAllWordFormsDictionary(word, langCode);

            // Every known form of the word is passed on as a phrase to keep.
            List<string> phrasesToKeep = new List<string>();
            phrasesToKeep.AddRange(allWordFormsDict.Keys);

            foreach (string titleSentence in titleSentences)
            {
                ExtractWordsAround(titleSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
            }
            foreach (string textSentence in textSentences)
            {
                ExtractWordsAround(textSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
            }

            WordsAndCounts result = contextWords;
            if (useLemmatization)
            {
                result = LemmaDictionaryUtils.GetBasicForms(contextWords, langCode);
            }

            ContextWordsCache.AddToCache(word, langCode, result);
            return result;
        }
        /// <summary>
        /// Computes the similarity between a Bulgarian and a Russian word as the
        /// highest <c>CalcMMEDR</c> value over all pairs of candidate forms.
        /// </summary>
        /// <remarks>
        /// Both words are lower-cased. The Bulgarian candidates are the word
        /// itself plus, when lemmatization applies, its basic forms. The Russian
        /// candidates are the word, its result under
        /// <c>TransformRussianInflextionToBulgarian</c>, and, when lemmatization
        /// applies, each basic form together with its transformed variant.
        /// Lemmatization applies only when <c>USE_LEMMATIZATION</c> is set and
        /// both words are at least <c>MIN_WORD_LENGTH_FOR_LEMMATIZATION</c> long.
        /// </remarks>
        /// <param name="bgWord">The Bulgarian word.</param>
        /// <param name="ruWord">The Russian word.</param>
        /// <returns>
        /// The largest pairwise score found, or 0 when no pair scores above 0.
        /// </returns>
        public static double CalculateSimilarity(string bgWord, string ruWord)
        {
            // --- Bulgarian candidate forms ---
            bgWord = bgWord.ToLowerInvariant();
            HashSet<string> bgCandidates = new HashSet<string> { bgWord };

            if (USE_LEMMATIZATION &&
                bgWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION &&
                ruWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION)
            {
                foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(bgWord, LANG_CODE_BG))
                {
                    bgCandidates.Add(lemma);
                }
            }

            // --- Russian candidate forms ---
            ruWord = ruWord.ToLowerInvariant();
            HashSet<string> ruCandidates = new HashSet<string>
            {
                ruWord,
                TransformRussianInflextionToBulgarian(ruWord)
            };

            if (USE_LEMMATIZATION &&
                bgWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION &&
                ruWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION)
            {
                foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(ruWord, LANG_CODE_RU))
                {
                    ruCandidates.Add(lemma);
                    ruCandidates.Add(TransformRussianInflextionToBulgarian(lemma));
                }
            }

            // --- Best score over every Bulgarian/Russian pair ---
            double best = 0;

            foreach (string bgForm in bgCandidates)
            {
                foreach (string ruForm in ruCandidates)
                {
                    double score = CalcMMEDR(bgForm, ruForm);
                    if (score > best)
                    {
                        best = score;
                    }
                }
            }

            return best;
        }