/// <summary>
/// Returns the context of <paramref name="word"/> extended with the contexts of its
/// sufficiently frequent context words (the "indirect" context).
/// </summary>
/// <param name="word">The word whose context is requested.</param>
/// <param name="langCode">Language code handed on to the retrieval and cache helpers.</param>
/// <returns>
/// The entry from <c>IndirectContextCache</c> when one exists; otherwise the direct context
/// merged (via <c>AddAll</c>) with the context of every direct context word whose count is at
/// least <c>SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext</c>.
/// The freshly computed result is written to the cache before it is returned.
/// </returns>
public static WordsAndCounts RetrieveContextWordsWithIndirectContext(
    string word, string langCode)
{
    if (IndirectContextCache.IsInCache(word, langCode))
    {
        return IndirectContextCache.LoadFromCache(word, langCode);
    }

    WordsAndCounts mergedContext = RetrieveContextWords(word, langCode);

    // Snapshot the direct context up front: the loop below keeps adding to
    // mergedContext, and the threshold has to be tested against the original
    // (direct) counts only.
    string[] directWords = new string[mergedContext.Words.Count];
    mergedContext.Words.CopyTo(directWords, 0);
    double[] directCounts = new double[mergedContext.Counts.Count];
    mergedContext.Counts.CopyTo(directCounts, 0);

    for (int index = 0; index < directWords.Length; index++)
    {
        double directCount = directCounts[index];
        if (directCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
        {
            string directWord = directWords[index];
            mergedContext.AddAll(RetrieveContextWords(directWord, langCode));
        }
    }

    IndirectContextCache.AddToCache(word, langCode, mergedContext);
    return mergedContext;
}
/// <summary>
/// Builds the context of a word by merging the contexts of all of its word forms and
/// dividing every merged count by the number of word forms.
/// </summary>
/// <param name="word">The word whose forms are looked up with <c>LemmaDictionaryUtils.GetAllWordForms</c>.</param>
/// <param name="langCode">Language code handed on to the dictionary, retrieval and cache helpers.</param>
/// <returns>
/// The entry from <c>AllWordFormsContextCache</c> when one exists; otherwise the merged and
/// averaged context, which is also stored in that cache.
/// </returns>
private static WordsAndCounts RetrieveContextWordsForAllWordForms(string word, string langCode)
{
    if (AllWordFormsContextCache.IsInCache(word, langCode))
    {
        return AllWordFormsContextCache.LoadFromCache(word, langCode);
    }

    // Retrieve and merge the contexts of every form of the given word.
    ICollection<string> wordForms = LemmaDictionaryUtils.GetAllWordForms(word, langCode);
    WordsAndCounts mergedContext = new WordsAndCounts();
    foreach (string form in wordForms)
    {
        mergedContext.AddAll(RetrieveContextWords(form, langCode));
    }

    // Normalize: iterate over a copy of the words because the counts are
    // rewritten while we walk through them.
    List<string> mergedWords = new List<string>(mergedContext.Words);
    foreach (string mergedWord in mergedWords)
    {
        double summedCount = mergedContext[mergedWord];
        mergedContext[mergedWord] = summedCount / wordForms.Count;
    }

    AllWordFormsContextCache.AddToCache(word, langCode, mergedContext);
    return mergedContext;
}
/// <summary>
/// Replaces every count in <paramref name="wordContext"/> with its TF-IDF weight, in place.
/// </summary>
/// <param name="wordContext">The context whose counts are overwritten.</param>
/// <param name="langCode">Language code handed on to <c>GetWordOccurencesOnTheWeb</c>.</param>
/// <remarks>
/// TF is the word's count divided by the sum of all counts in the context. IDF is the
/// base-2 logarithm of <c>TOTAL_NUMBER_OF_WORDS_ON_THE_WEB / (1 + occurrences on the web)</c>.
/// </remarks>
private static void ApplyTFIDFWeighting(WordsAndCounts wordContext, string langCode)
{
    WordAndCount[] entries = wordContext.AsSortedArray;

    // Denominator of the term frequency: total occurrences in this context.
    double totalOccurences = 0;
    foreach (WordAndCount entry in entries)
    {
        totalOccurences += entry.Count;
    }

    foreach (WordAndCount entry in entries)
    {
        string term = entry.Word;

        // Term frequency (TF).
        double termFrequency = wordContext[term] / totalOccurences;

        // Inverse document frequency (IDF); the "1 +" keeps the divisor non-zero
        // for words with no recorded web occurrences.
        double webOccurences = GetWordOccurencesOnTheWeb(term, langCode);
        double inverseDocumentFrequency =
            Math.Log(TOTAL_NUMBER_OF_WORDS_ON_THE_WEB / (1 + webOccurences), 2);

        // TFIDF[term] = TF[term] * IDF[term]
        wordContext[term] = termFrequency * inverseDocumentFrequency;
    }
}
/// <summary>
/// Adds every word of <paramref name="wordsAndCounts"/>, together with its count,
/// to this instance by calling <c>Add</c> once per word.
/// </summary>
/// <param name="wordsAndCounts">The source whose words and counts are added.</param>
public void AddAll(WordsAndCounts wordsAndCounts)
{
    foreach (var entry in wordsAndCounts.wordsCounts)
    {
        double amount = entry.Value;
        Add(entry.Key, amount);
    }
}
/// <summary>
/// Splits <paramref name="text"/> into words and counts every word that lies within
/// <c>contextSize</c> positions of an occurrence of one of the given word forms.
/// </summary>
/// <param name="text">The sentence to analyse; it is lower-cased and HTML-decoded first.</param>
/// <param name="allWordForms">Word forms whose occurrences mark the centre of a context window (only the keys are used).</param>
/// <param name="langCode">Selects the alphabet used for splitting: Cyrillic for <c>LANG_CODE_BG</c> / <c>LANG_CODE_RU</c>, Latin otherwise.</param>
/// <param name="wordsAndCounts">Accumulator; each selected word is added with a count of 1.</param>
/// <param name="phrasesToKeep">Phrases passed to <c>EncodePhrases</c> before splitting; <c>DecodePhrases</c> is applied to the split result.</param>
/// <remarks>
/// Overlapping windows are merged: a word position is counted once per call even when it is
/// close to several occurrences of the word forms. The matched word forms themselves fall
/// inside their own window and are counted too.
/// </remarks>
private static void ExtractWordsAround(
    string text, Dictionary<string, bool> allWordForms, string langCode,
    WordsAndCounts wordsAndCounts, List<string> phrasesToKeep)
{
    string textLowerCase = text.ToLowerInvariant();
    string textHtmlDecoded = HttpUtility.HtmlDecode(textLowerCase);
    string textWithoutPhrases = EncodePhrases(textHtmlDecoded, phrasesToKeep);

    // Everything that is not a letter of the language's alphabet (or the
    // BULLET_OPERATOR marker) is treated as a word separator.
    string separatorPattern;
    if ((langCode == LANG_CODE_BG) || (langCode == LANG_CODE_RU))
    {
        // The Cyrillic ranges are written as \u escapes on purpose: the literal
        // letters had been corrupted by a source-file encoding mismatch.
        // U+0410-U+042F are the capital letters A..YA, U+0430-U+044F the small ones.
        separatorPattern = "[^\u0410-\u042F\u0430-\u044F" + BULLET_OPERATOR + "]+";
    }
    else
    {
        separatorPattern = "[^A-Za-z" + BULLET_OPERATOR + "]+";
    }

    string[] allWords = Regex.Split(textWithoutPhrases, separatorPattern);
    DecodePhrases(allWords);

    List<string> validWords;
    if (removeShortAndStopWords)
    {
        // TODO: this does not work correctly for very short (two-letter) words.
        validWords = FilterWords(allWords, langCode, allWordForms);
    }
    else
    {
        validWords = new List<string>(allWords);
    }

    // Mark every position that lies inside the window around a word-form occurrence.
    bool[] wordsToInclude = new bool[validWords.Count];
    for (int i = 0; i < validWords.Count; i++)
    {
        string currentWord = validWords[i];
        if (allWordForms.ContainsKey(currentWord))
        {
            int start = Math.Max(0, i - contextSize);
            int end = Math.Min(i + contextSize, validWords.Count - 1);
            for (int contextIndex = start; contextIndex <= end; contextIndex++)
            {
                wordsToInclude[contextIndex] = true;
            }
        }
    }

    // Count each marked position exactly once.
    for (int i = 0; i < validWords.Count; i++)
    {
        if (wordsToInclude[i])
        {
            string wordToInclude = validWords[i];
            wordsAndCounts.Add(wordToInclude, 1);
        }
    }
}
/// <summary>
/// Reads the cached context for the given word and language from its cache file.
/// </summary>
/// <param name="word">The word part of the cache key.</param>
/// <param name="langCode">The language part of the cache key.</param>
/// <returns>The <c>WordsAndCounts</c> instance deserialized from the file named by <c>GetCacheFileName</c>.</returns>
/// <remarks>
/// The file must exist (it is opened with <c>FileMode.Open</c>); callers in this code base
/// check <c>IsInCache</c> first.
/// </remarks>
public static WordsAndCounts LoadFromCache(string word, string langCode)
{
    string cacheFileName = GetCacheFileName(word, langCode);

    // Open for reading only. The two-argument FileStream constructor requests
    // read/write access, which fails on read-only cache files and prevents
    // several readers from loading the same entry at the same time.
    using (FileStream input = new FileStream(
        cacheFileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // NOTE(review): BinaryFormatter is unsafe on untrusted data and is obsolete
        // in current .NET; it is kept here because existing cache files use this
        // format. Cache files must only ever come from a trusted location.
        BinaryFormatter formatter = new BinaryFormatter();
        return (WordsAndCounts)formatter.Deserialize(input);
    }
}
/// <summary>
/// Serializes <paramref name="contextWords"/> into the cache file for the given word and
/// language, creating the file or overwriting an existing one.
/// </summary>
/// <param name="word">The word part of the cache key.</param>
/// <param name="langCode">The language part of the cache key.</param>
/// <param name="contextWords">The context to store.</param>
public static void AddToCache(string word, string langCode, WordsAndCounts contextWords)
{
    string targetPath = GetCacheFileName(word, langCode);
    using (FileStream output = new FileStream(targetPath, FileMode.Create))
    {
        new BinaryFormatter().Serialize(output, contextWords);
    }
}
/// <summary>
/// Maps every word of <paramref name="words"/> to its basic forms and accumulates the counts.
/// </summary>
/// <param name="words">The words and counts to convert.</param>
/// <param name="langCode">Language code handed on to the per-word <c>GetBasicForms</c> lookup.</param>
/// <returns>
/// A new <c>WordsAndCounts</c> in which each basic form receives the full count of every
/// source word it was derived from (the count is not split between multiple basic forms).
/// </returns>
public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
{
    WordsAndCounts result = new WordsAndCounts();

    foreach (WordAndCount entry in words.AsSortedArray)
    {
        List<string> lemmas = LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode);
        foreach (string lemma in lemmas)
        {
            // Each lemma gets the whole count of the source word.
            double lemmaCount = (double)entry.Count;
            result.Add(lemma, lemmaCount);
        }
    }

    return result;
}
/// <summary>
/// Looks up how often <paramref name="word"/> occurs in <paramref name="context"/>.
/// </summary>
/// <param name="word">The word to look up.</param>
/// <param name="langCode">Language code used for the word-form lookup when lemmatization is on.</param>
/// <param name="context">The context whose counts are read.</param>
/// <returns>
/// With <c>useLemmatization</c> enabled, the sum of the context counts of all forms returned
/// by <c>LemmaDictionaryUtils.GetAllWordForms</c>; otherwise the context count of the word itself.
/// </returns>
private static double GetWordCountInContext(string word, String langCode, WordsAndCounts context)
{
    if (!useLemmatization)
    {
        return context[word];
    }

    double total = 0;
    foreach (string form in LemmaDictionaryUtils.GetAllWordForms(word, langCode))
    {
        total += context[form];
    }
    return total;
}
/// <summary>
/// Computes the cosine between two word-count sets by projecting both onto the union of
/// their words and delegating to <c>VectorUtils.CalcCosinusBetweenVectors</c>.
/// </summary>
/// <param name="wordsCounts1">The first set of words and counts.</param>
/// <param name="wordsCounts2">The second set of words and counts.</param>
/// <returns>The value returned by <c>VectorUtils.CalcCosinusBetweenVectors</c> for the two vectors.</returns>
public static double CalcCosinusBetweenWordsCounts(
    WordsAndCounts wordsCounts1, WordsAndCounts wordsCounts2)
{
    // Union of the words of both sets; the dictionary is used as an ordered set.
    Dictionary<string, bool> vocabulary = new Dictionary<string, bool>();
    foreach (string word in wordsCounts1.Words)
    {
        vocabulary.Add(word, true);
    }
    foreach (string word in wordsCounts2.Words)
    {
        vocabulary[word] = true;
    }

    // Fill both occurrence vectors in one pass so that position k refers to the
    // same word in each of them.
    double[] vector1 = new double[vocabulary.Count];
    double[] vector2 = new double[vocabulary.Count];
    int position = 0;
    foreach (string word in vocabulary.Keys)
    {
        vector1[position] = wordsCounts1[word];
        vector2[position] = wordsCounts2[word];
        position++;
    }

    return VectorUtils.CalcCosinusBetweenVectors(vector1, vector2);
}
/// <summary>
/// Returns the words found around occurrences of <paramref name="word"/> (and its word forms)
/// in the title and text sentences supplied by <c>RetrieveWebContextSentences</c>.
/// </summary>
/// <param name="word">The word whose context is requested; it is lower-cased before any other use.</param>
/// <param name="langCode">Language code handed on to the retrieval, extraction and cache helpers.</param>
/// <returns>
/// The entry from <c>ContextWordsCache</c> when one exists. Otherwise the extracted context,
/// converted to basic forms when <c>useLemmatization</c> is enabled; the computed result is
/// stored in the cache before it is returned.
/// </returns>
public static WordsAndCounts RetrieveContextWords(string word, string langCode)
{
    // Normalize BEFORE consulting the cache. Results are stored under the
    // lower-cased word, so looking up the word as passed could make a mixed-case
    // query miss its own cache entry and repeat the whole retrieval on every call.
    word = word.ToLowerInvariant();

    if (ContextWordsCache.IsInCache(word, langCode))
    {
        return ContextWordsCache.LoadFromCache(word, langCode);
    }

    List<string> titleSentences, textSentences;
    RetrieveWebContextSentences(word, langCode, out titleSentences, out textSentences);

    // The word forms serve both as the window centres and as phrases that must
    // survive tokenization intact.
    Dictionary<string, bool> allWordFormsDict = GetAllWordFormsDictionary(word, langCode);
    List<string> phrasesToKeep = new List<string>();
    phrasesToKeep.AddRange(allWordFormsDict.Keys);

    WordsAndCounts contextWords = new WordsAndCounts();
    foreach (string titleSentence in titleSentences)
    {
        ExtractWordsAround(titleSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
    }
    foreach (string textSentence in textSentences)
    {
        ExtractWordsAround(textSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
    }

    WordsAndCounts result = useLemmatization
        ? LemmaDictionaryUtils.GetBasicForms(contextWords, langCode)
        : contextWords;

    ContextWordsCache.AddToCache(word, langCode, result);
    return result;
}
/// <summary>
/// Returns the context of <paramref name="word"/> after a reverse check: a context word
/// keeps a count only as large as the count of <paramref name="word"/> in that context
/// word's own context.
/// </summary>
/// <param name="word">The word whose context is requested.</param>
/// <param name="langCode">Language code handed on to the retrieval and cache helpers.</param>
/// <returns>
/// The entry from <c>ReversedContextCache</c> when one exists. Otherwise the direct context in
/// which words below <c>SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext</c>
/// are removed and every remaining count is replaced by the minimum of the forward and
/// the reverse count; the result is stored in the cache before it is returned.
/// </returns>
public static WordsAndCounts RetrieveContextWordsWithReverseContext(
    string word, string langCode)
{
    if (ReversedContextCache.IsInCache(word, langCode))
    {
        return ReversedContextCache.LoadFromCache(word, langCode);
    }

    WordsAndCounts forwardContext = RetrieveContextWords(word, langCode);

    // Walk over a copy of the words: entries are removed or rewritten in the loop.
    List<string> candidates = new List<string>(forwardContext.Words);

    foreach (string candidate in candidates)
    {
        double forwardCount = forwardContext[candidate];
        if (forwardCount < SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
        {
            // Too rare to be worth a reverse lookup.
            forwardContext.RemoveWord(candidate);
        }
        else
        {
            // How often does the original word show up around the candidate?
            WordsAndCounts reverseContext = RetrieveContextWords(candidate, langCode);
            double reverseCount = reverseContext[word];
            forwardContext[candidate] = Math.Min(forwardCount, reverseCount);
        }
    }

    ReversedContextCache.AddToCache(word, langCode, forwardContext);
    return forwardContext;
}
/// <summary>
/// Builds the context vector of a Russian word over the Bulgarian dictionary words:
/// element i is the summed context count of all Russian translations of dictionary word i.
/// </summary>
/// <param name="ruWord">The Russian word whose context vector is requested.</param>
/// <returns>
/// The vector from <c>VectorsCache</c> when one exists; otherwise the newly computed vector,
/// which is stored in the cache before it is returned. Its length equals the length of
/// <c>BgRuDictionary.DictionaryBgWords</c>.
/// </returns>
/// <remarks>
/// The context source depends on <c>useIndirectContext</c> / <c>useQueryLemmatization</c>,
/// TF-IDF weighting is applied when <c>useTFIDF</c> is set, and with <c>useReverseContext</c>
/// each element is capped by the reverse count (or zeroed when below the threshold).
/// </remarks>
private static double[] CalculateRuDictionaryContextVector(string ruWord)
{
    // Serve from the vectors cache when possible.
    if (VectorsCache.IsInCache(ruWord, LANG_CODE_RU))
    {
        return VectorsCache.LoadFromCache(ruWord, LANG_CODE_RU);
    }

    // Pick the local context according to the configured strategy.
    WordsAndCounts context;
    if (useIndirectContext)
    {
        context = SemanticSimilarityUtils.
            RetrieveContextWordsWithIndirectContext(ruWord, LANG_CODE_RU);
    }
    else if (useQueryLemmatization)
    {
        context = SemanticSimilarityUtils.
            RetrieveContextWordsForAllWordForms(ruWord, LANG_CODE_RU);
    }
    else
    {
        context = SemanticSimilarityUtils.
            RetrieveContextWords(ruWord, LANG_CODE_RU);
    }

    if (useTFIDF)
    {
        ApplyTFIDFWeighting(context, LANG_CODE_RU);
    }

    // Forward pass: one slot per Bulgarian dictionary word, filled with the
    // context counts of that word's Russian translations.
    string[] bgDictionaryWords = BgRuDictionary.DictionaryBgWords;
    double[] contextVector = new double[bgDictionaryWords.Length];
    for (int slot = 0; slot < bgDictionaryWords.Length; slot++)
    {
        List<string> translations = BgRuDictionary.GetTranslations(bgDictionaryWords[slot]);
        foreach (string translation in translations)
        {
            contextVector[slot] += context[translation];
        }
    }

    if (useReverseContext)
    {
        // Reverse pass: check how often ruWord appears in the contexts of the
        // translations and keep the smaller of the two counts.
        for (int slot = 0; slot < bgDictionaryWords.Length; slot++)
        {
            double forwardCount = contextVector[slot];
            if (forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
            {
                List<string> translations = BgRuDictionary.GetTranslations(bgDictionaryWords[slot]);
                double reverseTotal = 0;
                foreach (string translation in translations)
                {
                    WordsAndCounts reverseContext =
                        SemanticSimilarityUtils.RetrieveContextWords(translation, LANG_CODE_RU);
                    reverseTotal += GetWordCountInContext(ruWord, LANG_CODE_RU, reverseContext);
                }
                contextVector[slot] = Math.Min(forwardCount, reverseTotal);
            }
            else
            {
                contextVector[slot] = 0;
            }
        }
    }

    // Remember the result for later calls.
    VectorsCache.AddToCache(ruWord, LANG_CODE_RU, contextVector);
    return contextVector;
}
/// <summary>
/// Builds the context vector of a Bulgarian word over the Bulgarian dictionary words:
/// element i is the context count of dictionary word i.
/// </summary>
/// <param name="bgWord">The Bulgarian word whose context vector is requested.</param>
/// <returns>
/// The vector from <c>VectorsCache</c> when one exists; otherwise the newly computed vector,
/// which is stored in the cache before it is returned. Its length equals the length of
/// <c>BgRuDictionary.DictionaryBgWords</c>.
/// </returns>
/// <remarks>
/// The context source depends on <c>useIndirectContext</c> / <c>useQueryLemmatization</c>,
/// TF-IDF weighting is applied when <c>useTFIDF</c> is set, and with <c>useReverseContext</c>
/// each element is capped by the reverse count (or zeroed when below the threshold).
/// </remarks>
private static double[] CalculateBgDictionaryContextVector(string bgWord)
{
    // Serve from the vectors cache when possible.
    if (VectorsCache.IsInCache(bgWord, LANG_CODE_BG))
    {
        return VectorsCache.LoadFromCache(bgWord, LANG_CODE_BG);
    }

    // Pick the local context according to the configured strategy.
    WordsAndCounts context;
    if (useIndirectContext)
    {
        context = SemanticSimilarityUtils.
            RetrieveContextWordsWithIndirectContext(bgWord, LANG_CODE_BG);
    }
    else if (useQueryLemmatization)
    {
        context = SemanticSimilarityUtils.
            RetrieveContextWordsForAllWordForms(bgWord, LANG_CODE_BG);
    }
    else
    {
        context = SemanticSimilarityUtils.
            RetrieveContextWords(bgWord, LANG_CODE_BG);
    }

    if (useTFIDF)
    {
        ApplyTFIDFWeighting(context, LANG_CODE_BG);
    }

    // Forward pass: one slot per dictionary word, filled with its context count.
    string[] bgDictionaryWords = BgRuDictionary.DictionaryBgWords;
    double[] contextVector = new double[bgDictionaryWords.Length];
    for (int slot = 0; slot < bgDictionaryWords.Length; slot++)
    {
        contextVector[slot] = context[bgDictionaryWords[slot]];
    }

    if (useReverseContext)
    {
        // Reverse pass: check how often bgWord appears in the dictionary word's
        // own context and keep the smaller of the two counts.
        for (int slot = 0; slot < bgDictionaryWords.Length; slot++)
        {
            double forwardCount = contextVector[slot];
            if (forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
            {
                WordsAndCounts reverseContext =
                    SemanticSimilarityUtils.RetrieveContextWords(bgDictionaryWords[slot], LANG_CODE_BG);
                double reverseCount = GetWordCountInContext(bgWord, LANG_CODE_BG, reverseContext);
                contextVector[slot] = Math.Min(forwardCount, reverseCount);
            }
            else
            {
                contextVector[slot] = 0;
            }
        }
    }

    // Remember the result for later calls.
    VectorsCache.AddToCache(bgWord, LANG_CODE_BG, contextVector);
    return contextVector;
}