/// <summary>
/// Builds the merged context of every word form of <paramref name="word"/>, with each
/// count divided by the number of word forms. Results are served from and stored in
/// <c>AllWordFormsContextCache</c>.
/// </summary>
/// <param name="word">The word whose word forms are looked up.</param>
/// <param name="langCode">Language code passed on to the dictionary, retrieval and cache calls.</param>
/// <returns>The cached or freshly computed context words with normalized counts.</returns>
private static WordsAndCounts RetrieveContextWordsForAllWordForms(string word, string langCode)
{
    if (AllWordFormsContextCache.IsInCache(word, langCode))
    {
        return AllWordFormsContextCache.LoadFromCache(word, langCode);
    }

    // Retrieve the context of each word form and merge them into a single result.
    ICollection<string> wordForms = LemmaDictionaryUtils.GetAllWordForms(word, langCode);
    WordsAndCounts merged = new WordsAndCounts();
    foreach (string form in wordForms)
    {
        merged.AddAll(RetrieveContextWords(form, langCode));
    }

    // Normalize the counts. The words are copied into a separate list first so the
    // counts can be reassigned while walking over that copy.
    List<string> wordsSnapshot = new List<string>(merged.Words);
    foreach (string contextWord in wordsSnapshot)
    {
        double rawCount = merged[contextWord];
        merged[contextWord] = rawCount / wordForms.Count;
    }

    AllWordFormsContextCache.AddToCache(word, langCode, merged);
    return merged;
}
/// <summary>
/// Returns the lemmas to use for <paramref name="word"/>: the basic forms reported by
/// <c>LemmaDictionaryUtils.GetBasicForms</c> when <c>USE_LEMMATIZATION</c> is set,
/// otherwise a new single-element list holding the word itself.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <param name="langCode">Language code passed on to the lemma dictionary.</param>
/// <returns>The list of lemmas for the word.</returns>
private List<string> GetLemmas(string word, string langCode)
{
    return USE_LEMMATIZATION
        ? LemmaDictionaryUtils.GetBasicForms(word, langCode)
        : new List<string> { word };
}
/// <summary>
/// Collects the distinct non-basic forms of every basic form of <paramref name="word"/>.
/// </summary>
/// <param name="word">The word whose basic forms are looked up first.</param>
/// <param name="langCode">Language code passed on to the lemma dictionary.</param>
/// <returns>
/// The key collection of an internal dictionary, i.e. each collected form exactly once.
/// </returns>
public static ICollection<string> GetAllWordForms(string word, string langCode)
{
    // The dictionary is used purely as a set: only its keys matter.
    var uniqueForms = new Dictionary<string, bool>();

    foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(word, langCode))
    {
        foreach (string form in LemmaDictionaryUtils.GetNonBasicForms(lemma, langCode))
        {
            uniqueForms[form] = true;
        }
    }

    return uniqueForms.Keys;
}
/// <summary>
/// Maps every word in <paramref name="words"/> to its basic forms and adds each basic
/// form to a new <c>WordsAndCounts</c> with the count of the word it came from.
/// </summary>
/// <param name="words">The words and counts to convert.</param>
/// <param name="langCode">Language code passed on to the lemma dictionary.</param>
/// <returns>A new <c>WordsAndCounts</c> keyed by basic forms.</returns>
public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
{
    WordsAndCounts result = new WordsAndCounts();

    foreach (WordAndCount entry in words.AsSortedArray)
    {
        foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode))
        {
            // Each basic form receives the full count of the source word; the count
            // is not divided among the basic forms.
            result.Add(lemma, (double)entry.Count);
        }
    }

    return result;
}
/// <summary>
/// Looks up how often <paramref name="word"/> occurs in <paramref name="context"/>.
/// With <c>useLemmatization</c> set, the counts of all word forms returned by
/// <c>LemmaDictionaryUtils.GetAllWordForms</c> are summed; otherwise only the count
/// stored under the word itself is returned.
/// </summary>
/// <param name="word">The word to count.</param>
/// <param name="langCode">Language code passed on to the lemma dictionary.</param>
/// <param name="context">The context words and counts to read from.</param>
/// <returns>The (possibly summed) count.</returns>
private static double GetWordCountInContext(string word, String langCode, WordsAndCounts context)
{
    if (!useLemmatization)
    {
        return context[word];
    }

    double total = 0;
    foreach (string form in LemmaDictionaryUtils.GetAllWordForms(word, langCode))
    {
        total += context[form];
    }
    return total;
}
/// <summary>
/// Builds a lookup dictionary whose keys are the word forms to match for
/// <paramref name="word"/>; every value is <c>true</c>. With <c>useLemmatization</c>
/// set, the keys are the forms returned by <c>LemmaDictionaryUtils.GetAllWordForms</c>;
/// otherwise the only key is the word itself.
/// </summary>
/// <param name="word">The word to build the lookup for.</param>
/// <param name="langCode">Language code passed on to the lemma dictionary.</param>
/// <returns>A new dictionary keyed by word form.</returns>
private static Dictionary<string, bool> GetAllWordFormsDictionary(string word, string langCode)
{
    var lookup = new Dictionary<string, bool>();

    if (!useLemmatization)
    {
        lookup.Add(word, true);
        return lookup;
    }

    foreach (string form in LemmaDictionaryUtils.GetAllWordForms(word, langCode))
    {
        lookup.Add(form, true);
    }
    return lookup;
}
/// <summary>
/// Retrieves the context words of <paramref name="word"/>, serving the result from
/// <c>ContextWordsCache</c> when available and storing it there otherwise.
/// </summary>
/// <remarks>
/// The word is lower-cased (invariant culture) before the cache is consulted, so the
/// lookup key is the same key the result is later stored under. Previously the lookup
/// used the caller's original casing while the store used the lower-cased word, so an
/// input containing upper-case characters was never found under its own cache entry.
/// </remarks>
/// <param name="word">The word to retrieve context words for; compared in lower case.</param>
/// <param name="langCode">Language code passed on to the retrieval, dictionary and cache calls.</param>
/// <returns>
/// The context words and counts; converted to basic forms via
/// <c>LemmaDictionaryUtils.GetBasicForms</c> when <c>useLemmatization</c> is set.
/// </returns>
public static WordsAndCounts RetrieveContextWords(string word, string langCode)
{
    // Normalize first so the cache lookup and the cache store agree on the key.
    word = word.ToLowerInvariant();

    if (ContextWordsCache.IsInCache(word, langCode))
    {
        return ContextWordsCache.LoadFromCache(word, langCode);
    }

    List<string> titleSentences, textSentences;
    RetrieveWebContextSentences(word, langCode, out titleSentences, out textSentences);

    // The word forms serve both as the lookup passed to ExtractWordsAround and as the
    // initial content of the phrasesToKeep list.
    Dictionary<string, bool> allWordFormsDict = GetAllWordFormsDictionary(word, langCode);
    List<string> phrasesToKeep = new List<string>(allWordFormsDict.Keys);

    // contextWords is handed to every ExtractWordsAround call and is the object that is
    // returned/cached below; presumably ExtractWordsAround fills it in - confirm against
    // its definition.
    WordsAndCounts contextWords = new WordsAndCounts();
    foreach (string titleSentence in titleSentences)
    {
        ExtractWordsAround(titleSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
    }
    foreach (string textSentence in textSentences)
    {
        ExtractWordsAround(textSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
    }

    WordsAndCounts result = useLemmatization
        ? LemmaDictionaryUtils.GetBasicForms(contextWords, langCode)
        : contextWords;

    ContextWordsCache.AddToCache(word, langCode, result);
    return result;
}
/// <summary>
/// Returns the highest <c>CalcMMEDR</c> value over all pairs of a Bulgarian word form
/// and a Russian word form derived from the two inputs.
/// </summary>
/// <remarks>
/// Both words are lower-cased first. The Bulgarian candidates are the word itself plus,
/// when lemmatization applies, its basic forms. The Russian candidates are the word,
/// its <c>TransformRussianInflextionToBulgarian</c> variant and, when lemmatization
/// applies, each basic form together with its transformed variant. Lemmatization
/// applies only if <c>USE_LEMMATIZATION</c> is set and both words are at least
/// <c>MIN_WORD_LENGTH_FOR_LEMMATIZATION</c> characters long.
/// </remarks>
/// <param name="bgWord">The Bulgarian word.</param>
/// <param name="ruWord">The Russian word.</param>
/// <returns>The maximum pairwise value, or 0 when no pair scores above 0.</returns>
public static double CalculateSimilarity(string bgWord, string ruWord)
{
    bgWord = bgWord.ToLowerInvariant();

    // One decision for both languages. Lower-casing does not change string length, so
    // testing ruWord's length before it is lower-cased gives the same answer.
    bool lemmatize = USE_LEMMATIZATION
        && bgWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION
        && ruWord.Length >= MIN_WORD_LENGTH_FOR_LEMMATIZATION;

    // Candidate forms of the Bulgarian word.
    HashSet<string> bgCandidates = new HashSet<string> { bgWord };
    if (lemmatize)
    {
        foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(bgWord, LANG_CODE_BG))
        {
            bgCandidates.Add(lemma);
        }
    }

    // Candidate forms of the Russian word, each also in its transformed variant.
    ruWord = ruWord.ToLowerInvariant();
    HashSet<string> ruCandidates = new HashSet<string>
    {
        ruWord,
        TransformRussianInflextionToBulgarian(ruWord)
    };
    if (lemmatize)
    {
        foreach (string lemma in LemmaDictionaryUtils.GetBasicForms(ruWord, LANG_CODE_RU))
        {
            ruCandidates.Add(lemma);
            ruCandidates.Add(TransformRussianInflextionToBulgarian(lemma));
        }
    }

    // Keep the best MMEDR over every Bulgarian/Russian candidate pair.
    double best = 0;
    foreach (string bgForm in bgCandidates)
    {
        foreach (string ruForm in ruCandidates)
        {
            double score = CalcMMEDR(bgForm, ruForm);
            if (score > best)
            {
                best = score;
            }
        }
    }
    return best;
}