/// <summary>
/// Builds the dictionary context vector of a word in the <c>LANG_CODE_BG</c> language.
/// Element <c>i</c> of the result corresponds to entry <c>i</c> of
/// <c>BgRuDictionary.DictionaryBgWords</c> and holds the count of that dictionary word
/// in the context retrieved for <paramref name="bgWord"/>.
/// </summary>
/// <remarks>
/// The vector is served from <c>VectorsCache</c> when present there; otherwise it is
/// computed and added to the cache before being returned. The way the context is
/// retrieved and post-processed depends on the <c>useIndirectContext</c>,
/// <c>useQueryLemmatization</c>, <c>useTFIDF</c> and <c>useReverseContext</c> switches.
/// </remarks>
/// <param name="bgWord">The word whose context vector is requested.</param>
/// <returns>An array with one element per dictionary word.</returns>
private static double[] CalculateBgDictionaryContextVector(string bgWord)
{
    // A vector that was computed earlier is returned straight from the cache.
    if (VectorsCache.IsInCache(bgWord, LANG_CODE_BG))
    {
        return VectorsCache.LoadFromCache(bgWord, LANG_CODE_BG);
    }

    // Pick the context retrieval strategy; the indirect context takes precedence
    // over query lemmatization, plain retrieval is the fallback.
    WordsAndCounts context = useIndirectContext
        ? SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(bgWord, LANG_CODE_BG)
        : useQueryLemmatization
            ? SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(bgWord, LANG_CODE_BG)
            : SemanticSimilarityUtils.RetrieveContextWords(bgWord, LANG_CODE_BG);

    if (useTFIDF)
    {
        ApplyTFIDFWeighting(context, LANG_CODE_BG);
    }

    // Forward pass: read the count of every dictionary word from the word's context.
    string[] dictionaryWords = BgRuDictionary.DictionaryBgWords;
    double[] vector = new double[dictionaryWords.Length];
    for (int index = 0; index < vector.Length; index++)
    {
        vector[index] = context[dictionaryWords[index]];
    }

    if (useReverseContext)
    {
        // Reverse pass: a dictionary word keeps a non-zero weight only if its forward
        // count reaches the threshold; the weight is then capped by the count of
        // bgWord in that dictionary word's own context.
        for (int index = 0; index < vector.Length; index++)
        {
            double forwardCount = vector[index];

            // Written as a negated ">=" so that a NaN count is zeroed as well.
            if (!(forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext))
            {
                vector[index] = 0;
                continue;
            }

            WordsAndCounts reverseContext =
                SemanticSimilarityUtils.RetrieveContextWords(dictionaryWords[index], LANG_CODE_BG);
            double reverseCount = GetWordCountInContext(bgWord, LANG_CODE_BG, reverseContext);
            vector[index] = Math.Min(forwardCount, reverseCount);
        }
    }

    // Remember the result so the next request for this word hits the cache.
    VectorsCache.AddToCache(bgWord, LANG_CODE_BG, vector);
    return vector;
}
/// <summary>
/// Builds the dictionary context vector of a word in the <c>LANG_CODE_RU</c> language.
/// Element <c>i</c> of the result corresponds to entry <c>i</c> of
/// <c>BgRuDictionary.DictionaryBgWords</c> and holds the summed counts, in the context
/// retrieved for <paramref name="ruWord"/>, of all translations that
/// <c>BgRuDictionary.GetTranslations</c> returns for that dictionary entry.
/// </summary>
/// <remarks>
/// The vector is served from <c>VectorsCache</c> when present there; otherwise it is
/// computed and added to the cache before being returned. The way the context is
/// retrieved and post-processed depends on the <c>useIndirectContext</c>,
/// <c>useQueryLemmatization</c>, <c>useTFIDF</c> and <c>useReverseContext</c> switches.
/// </remarks>
/// <param name="ruWord">The word whose context vector is requested.</param>
/// <returns>An array with one element per dictionary entry.</returns>
private static double[] CalculateRuDictionaryContextVector(string ruWord)
{
    // A vector that was computed earlier is returned straight from the cache.
    if (VectorsCache.IsInCache(ruWord, LANG_CODE_RU))
    {
        return VectorsCache.LoadFromCache(ruWord, LANG_CODE_RU);
    }

    // Pick the context retrieval strategy; the indirect context takes precedence
    // over query lemmatization, plain retrieval is the fallback.
    WordsAndCounts context = useIndirectContext
        ? SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(ruWord, LANG_CODE_RU)
        : useQueryLemmatization
            ? SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(ruWord, LANG_CODE_RU)
            : SemanticSimilarityUtils.RetrieveContextWords(ruWord, LANG_CODE_RU);

    if (useTFIDF)
    {
        ApplyTFIDFWeighting(context, LANG_CODE_RU);
    }

    // Forward pass: for every dictionary entry, add up the context counts of all
    // of its translations.
    string[] dictionaryWords = BgRuDictionary.DictionaryBgWords;
    double[] vector = new double[dictionaryWords.Length];
    for (int index = 0; index < vector.Length; index++)
    {
        List<string> translations = BgRuDictionary.GetTranslations(dictionaryWords[index]);
        double forwardTotal = 0;
        foreach (string translation in translations)
        {
            forwardTotal += context[translation];
        }
        vector[index] = forwardTotal;
    }

    if (useReverseContext)
    {
        // Reverse pass: an entry keeps a non-zero weight only if its forward count
        // reaches the threshold; the weight is then capped by the total count of
        // ruWord across the contexts of the entry's translations.
        for (int index = 0; index < vector.Length; index++)
        {
            double forwardCount = vector[index];

            // Written as a negated ">=" so that a NaN count is zeroed as well.
            if (!(forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext))
            {
                vector[index] = 0;
                continue;
            }

            List<string> translations = BgRuDictionary.GetTranslations(dictionaryWords[index]);
            double reverseTotal = 0;
            foreach (string translation in translations)
            {
                WordsAndCounts reverseContext =
                    SemanticSimilarityUtils.RetrieveContextWords(translation, LANG_CODE_RU);
                double reverseCount = GetWordCountInContext(ruWord, LANG_CODE_RU, reverseContext);
                reverseTotal += reverseCount;
            }
            vector[index] = Math.Min(forwardCount, reverseTotal);
        }
    }

    // Remember the result so the next request for this word hits the cache.
    VectorsCache.AddToCache(ruWord, LANG_CODE_RU, vector);
    return vector;
}
/// <summary>
/// Calculates the semantic similarity between a pair of words in the same language.
/// The score is the value that <c>VectorUtils.CalcCosinusBetweenWordsCounts</c> yields
/// for the two words' context word counts.
/// </summary>
/// <remarks>
/// The context of each word is retrieved according to the <c>useReverseContext</c>,
/// <c>useQueryLemmatization</c> and <c>useIndirectContext</c> switches (checked in that
/// order), and is TF-IDF weighted when <c>useTFIDF</c> is set.
/// NOTE(review): the weighting is applied to both retrieved context objects; if the
/// retrieval can hand back the same instance for both words, it would be weighted
/// twice - confirm against the retrieval implementation.
/// </remarks>
/// <param name="firstWord">The first word of the pair.</param>
/// <param name="secondWord">The second word of the pair.</param>
/// <param name="langCode">The language code shared by both words.</param>
/// <returns>The cosine-based score of the two context vectors.</returns>
/// <exception cref="NotSupportedException">
/// Thrown when <c>vectorDiffAlgorithm</c> is anything other than
/// <c>VectorDiffAlgorithm.COSINE</c>.
/// </exception>
public static double SemSim(
    string firstWord, string secondWord, string langCode)
{
    // Fail fast: reject an unsupported configuration before doing any context
    // retrieval or weighting whose result could not be used anyway.
    if (vectorDiffAlgorithm != VectorDiffAlgorithm.COSINE)
    {
        throw new NotSupportedException("Algorithm not supported!");
    }

    WordsAndCounts firstWordContext = RetrieveSemSimContext(firstWord, langCode);
    WordsAndCounts secondWordContext = RetrieveSemSimContext(secondWord, langCode);

    if (useTFIDF)
    {
        ApplyTFIDFWeighting(firstWordContext, langCode);
        ApplyTFIDFWeighting(secondWordContext, langCode);
    }

    return VectorUtils.CalcCosinusBetweenWordsCounts(
        firstWordContext, secondWordContext);
}

/// <summary>
/// Retrieves the context of a word for <c>SemSim</c>, choosing the retrieval strategy
/// from the configuration switches: reverse context first, then query lemmatization,
/// then indirect context, and plain retrieval when none of them is set.
/// </summary>
private static WordsAndCounts RetrieveSemSimContext(string word, string langCode)
{
    if (useReverseContext)
    {
        return SemanticSimilarityUtils.RetrieveContextWordsWithReverseContext(word, langCode);
    }

    if (useQueryLemmatization)
    {
        return SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(word, langCode);
    }

    if (useIndirectContext)
    {
        return SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(word, langCode);
    }

    return SemanticSimilarityUtils.RetrieveContextWords(word, langCode);
}