Exemple #1
0
        /// <summary>
        /// Retrieves the context words of <paramref name="word"/> and merges into them the
        /// context words of every direct context word that occurs often enough.
        /// </summary>
        /// <param name="word">The word whose context is retrieved.</param>
        /// <param name="langCode">Language code forwarded to the cache and to RetrieveContextWords.</param>
        /// <returns>
        /// The merged words and counts. The result is read from IndirectContextCache when an
        /// entry exists there, otherwise it is computed and added to that cache.
        /// </returns>
        public static WordsAndCounts RetrieveContextWordsWithIndirectContext(
            string word, string langCode)
        {
            if (IndirectContextCache.IsInCache(word, langCode))
            {
                return IndirectContextCache.LoadFromCache(word, langCode);
            }

            WordsAndCounts mergedContext = RetrieveContextWords(word, langCode);

            // Snapshot the direct context first: AddAll below is invoked on mergedContext
            // while its original words and counts are still being walked.
            string[] directWords = new string[mergedContext.Words.Count];
            mergedContext.Words.CopyTo(directWords, 0);
            double[] directCounts = new double[mergedContext.Counts.Count];
            mergedContext.Counts.CopyTo(directCounts, 0);

            // Pull in the context of each sufficiently frequent direct context word
            for (int position = 0; position < directWords.Length; position++)
            {
                bool frequentEnough =
                    directCounts[position] >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext;
                if (!frequentEnough)
                {
                    continue;
                }

                mergedContext.AddAll(RetrieveContextWords(directWords[position], langCode));
            }

            IndirectContextCache.AddToCache(word, langCode, mergedContext);
            return mergedContext;
        }
Exemple #2
0
        /// <summary>
        /// Retrieves the context words of every word form of <paramref name="word"/>, merges
        /// them, and divides each merged count by the number of word forms.
        /// </summary>
        /// <param name="word">The word whose forms are looked up via LemmaDictionaryUtils.GetAllWordForms.</param>
        /// <param name="langCode">Language code forwarded to the cache, the lemma dictionary and RetrieveContextWords.</param>
        /// <returns>
        /// The normalized words and counts. The result is read from AllWordFormsContextCache
        /// when an entry exists there, otherwise it is computed and added to that cache.
        /// </returns>
        private static WordsAndCounts RetrieveContextWordsForAllWordForms(string word, string langCode)
        {
            if (AllWordFormsContextCache.IsInCache(word, langCode))
            {
                return AllWordFormsContextCache.LoadFromCache(word, langCode);
            }

            // Retrieve and merge the contexts of all word forms of the given word
            ICollection <string> wordForms = LemmaDictionaryUtils.GetAllWordForms(word, langCode);
            WordsAndCounts       merged    = new WordsAndCounts();
            foreach (string form in wordForms)
            {
                merged.AddAll(RetrieveContextWords(form, langCode));
            }

            // Normalize the counts. Walk a snapshot of the words because the indexer
            // assignment below writes to the collection the words come from.
            List <string> snapshot = new List <string>(merged.Words);
            foreach (string mergedWord in snapshot)
            {
                double current = merged[mergedWord];
                merged[mergedWord] = current / wordForms.Count;
            }

            AllWordFormsContextCache.AddToCache(word, langCode, merged);
            return merged;
        }
Exemple #3
0
        /// <summary>
        /// Replaces, in place, every count in <paramref name="wordContext"/> with its TF-IDF weight.
        /// </summary>
        /// <remarks>
        /// TF is the word's count divided by the sum of all counts in the context.
        /// IDF is log2(TOTAL_NUMBER_OF_WORDS_ON_THE_WEB / (1 + occurrences of the word on the web)).
        /// </remarks>
        /// <param name="wordContext">The context whose counts are overwritten.</param>
        /// <param name="langCode">Language code forwarded to GetWordOccurencesOnTheWeb.</param>
        private static void ApplyTFIDFWeighting(WordsAndCounts wordContext, string langCode)
        {
            WordAndCount[] entries = wordContext.AsSortedArray;

            // Total number of occurrences across the whole context (the TF denominator)
            double totalOccurences = 0;
            foreach (WordAndCount entry in entries)
            {
                double entryOccurences = entry.Count;
                totalOccurences = totalOccurences + entryOccurences;
            }

            foreach (WordAndCount entry in entries)
            {
                string term = entry.Word;

                // Term frequency (TF)
                double termFrequency = wordContext[term] / totalOccurences;

                // Inverse document frequency (IDF); the "1 +" keeps the divisor non-zero
                // when the reported web occurrence count is zero
                double webOccurences = GetWordOccurencesOnTheWeb(term, langCode);
                double inverseDocumentFrequency =
                    Math.Log(TOTAL_NUMBER_OF_WORDS_ON_THE_WEB / (1 + webOccurences), 2);

                // TFIDF[term] = TF[term] * IDF[term]
                wordContext[term] = termFrequency * inverseDocumentFrequency;
            }
        }
Exemple #4
0
 /// <summary>
 /// Adds every word of <paramref name="wordsAndCounts"/>, together with its count,
 /// to this instance by calling Add once per entry.
 /// </summary>
 /// <param name="wordsAndCounts">The source whose entries are added.</param>
 public void AddAll(WordsAndCounts wordsAndCounts)
 {
     foreach (var entry in wordsAndCounts.wordsCounts)
     {
         double entryCount = entry.Value;
         this.Add(entry.Key, entryCount);
     }
 }
Exemple #5
0
        /// <summary>
        /// Splits <paramref name="text"/> into words and adds to <paramref name="wordsAndCounts"/>
        /// every word that lies within contextSize positions of an occurrence of one of the
        /// word forms in <paramref name="allWordForms"/> (the occurrence itself included).
        /// </summary>
        /// <param name="text">Raw text; it is lower-cased and HTML-decoded before being split.</param>
        /// <param name="allWordForms">
        /// Word forms whose occurrences anchor the context windows. Only the keys are looked up
        /// here; the dictionary is also handed to FilterWords.
        /// </param>
        /// <param name="langCode">
        /// Selects the alphabet used for splitting: Cyrillic for LANG_CODE_BG and LANG_CODE_RU,
        /// Latin for everything else.
        /// </param>
        /// <param name="wordsAndCounts">Accumulator; every selected word position is added with count 1.</param>
        /// <param name="phrasesToKeep">Phrases handed to EncodePhrases before the text is split.</param>
        private static void ExtractWordsAround(
            string text, Dictionary <string, bool> allWordForms, string langCode,
            WordsAndCounts wordsAndCounts, List <string> phrasesToKeep)
        {
            string textLowerCase      = text.ToLowerInvariant();
            string textHtmlDecoded    = HttpUtility.HtmlDecode(textLowerCase);
            string textWithoutPhrases = EncodePhrases(textHtmlDecoded, phrasesToKeep);

            string[] allWords;
            if ((langCode == LANG_CODE_BG) || (langCode == LANG_CODE_RU))
            {
                // Cyrillic letters А-Я and а-я, written as Unicode escapes so the pattern does not
                // depend on the encoding of this source file. The previous literal was mis-encoded
                // and contained a reversed character range, which Regex rejects.
                string nonCyrillicChars = "[^\u0410-\u042F\u0430-\u044F" + BULLET_OPERATOR + "]+";
                allWords = Regex.Split(textWithoutPhrases, nonCyrillicChars);
            }
            else
            {
                string nonLatinChars = "[^A-Za-z" + BULLET_OPERATOR + "]+";
                allWords = Regex.Split(textWithoutPhrases, nonLatinChars);
            }
            DecodePhrases(allWords);

            List <string> validWords;

            if (removeShortAndStopWords)
            {
                // TODO: this does not work correctly for short words like "но"
                validWords = FilterWords(allWords, langCode, allWordForms);
            }
            else
            {
                validWords = new List <string>(allWords);
            }

            // First pass: flag every position inside a window around an anchor word.
            // Flags (rather than direct adds) make overlapping windows count a position once.
            bool[] wordsToInclude = new bool[validWords.Count];
            for (int i = 0; i < validWords.Count; i++)
            {
                string currentWord = validWords[i];
                if (allWordForms.ContainsKey(currentWord))
                {
                    // Clamp the window to the bounds of the word list
                    int start = Math.Max(0, i - contextSize);
                    int end   = Math.Min(i + contextSize, validWords.Count - 1);
                    for (int contextIndex = start; contextIndex <= end; contextIndex++)
                    {
                        wordsToInclude[contextIndex] = true;
                    }
                }
            }

            // Second pass: add every flagged position with a count of one
            for (int i = 0; i < validWords.Count; i++)
            {
                if (wordsToInclude[i])
                {
                    string wordToInclude = validWords[i];
                    wordsAndCounts.Add(wordToInclude, 1);
                }
            }
        }
Exemple #6
0
        /// <summary>
        /// Deserializes the cached <see cref="WordsAndCounts"/> stored for the given word and language.
        /// </summary>
        /// <param name="word">The word whose cache entry is loaded.</param>
        /// <param name="langCode">Language code; together with <paramref name="word"/> it determines the cache file name.</param>
        /// <returns>The deserialized words and counts.</returns>
        /// <remarks>
        /// The cache file is opened without an existence check here; a missing file surfaces as
        /// the exception thrown by the <see cref="FileStream"/> constructor.
        /// </remarks>
        public static WordsAndCounts LoadFromCache(string word, string langCode)
        {
            string cacheFileName = GetCacheFileName(word, langCode);

            // Open strictly for reading. The two-argument FileStream constructor requests
            // read/write access, which fails on read-only cache files and makes a second
            // concurrent reader of the same entry fail with a sharing violation.
            using (FileStream input = new FileStream(
                       cacheFileName, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                // NOTE(review): BinaryFormatter is unsafe on untrusted input and has been removed
                // from recent .NET versions. It is kept here so existing cache files stay readable;
                // only load cache files this application wrote itself.
                BinaryFormatter formatter = new BinaryFormatter();
                return (WordsAndCounts)formatter.Deserialize(input);
            }
        }
Exemple #7
0
        /// <summary>
        /// Serializes <paramref name="contextWords"/> into the cache file for the given word
        /// and language, creating the file or overwriting an existing one.
        /// </summary>
        /// <param name="word">The word the cache entry belongs to.</param>
        /// <param name="langCode">Language code; together with <paramref name="word"/> it determines the cache file name.</param>
        /// <param name="contextWords">The words and counts to store.</param>
        public static void AddToCache(string word, string langCode,
                                      WordsAndCounts contextWords)
        {
            using (FileStream output = new FileStream(GetCacheFileName(word, langCode), FileMode.Create))
            {
                new BinaryFormatter().Serialize(output, contextWords);
            }
        }
        /// <summary>
        /// Maps every word in <paramref name="words"/> to its basic forms and accumulates the
        /// counts under those basic forms.
        /// </summary>
        /// <param name="words">The words and counts to convert.</param>
        /// <param name="langCode">Language code forwarded to the per-word basic-form lookup.</param>
        /// <returns>
        /// A new <see cref="WordsAndCounts"/> in which each basic form has been added with the
        /// full count of every source word that maps to it (the count is not divided among
        /// multiple basic forms).
        /// </returns>
        public static WordsAndCounts GetBasicForms(WordsAndCounts words, string langCode)
        {
            WordsAndCounts result = new WordsAndCounts();

            foreach (WordAndCount entry in words.AsSortedArray)
            {
                List <string> lemmas = LemmaDictionaryUtils.GetBasicForms(entry.Word, langCode);
                foreach (string lemma in lemmas)
                {
                    double fullCount = (double)entry.Count;
                    result.Add(lemma, fullCount);
                }
            }

            return result;
        }
Exemple #9
0
 /// <summary>
 /// Returns how often <paramref name="word"/> occurs in <paramref name="context"/>.
 /// When useLemmatization is set, the counts of all word forms of the word are summed;
 /// otherwise only the count stored for the word itself is returned.
 /// </summary>
 /// <param name="word">The word to look up.</param>
 /// <param name="langCode">Language code forwarded to LemmaDictionaryUtils.GetAllWordForms.</param>
 /// <param name="context">The context whose counts are read.</param>
 /// <returns>The (possibly summed) count.</returns>
 private static double GetWordCountInContext(string word, String langCode, WordsAndCounts context)
 {
     if (!useLemmatization)
     {
         return context[word];
     }

     double total = 0;
     foreach (string form in LemmaDictionaryUtils.GetAllWordForms(word, langCode))
     {
         total += context[form];
     }
     return total;
 }
        /// <summary>
        /// Computes the cosine between two word-count collections by projecting both onto the
        /// union of their words and delegating to VectorUtils.CalcCosinusBetweenVectors.
        /// </summary>
        /// <param name="wordsCounts1">The first collection of words and counts.</param>
        /// <param name="wordsCounts2">The second collection of words and counts.</param>
        /// <returns>The value returned by VectorUtils.CalcCosinusBetweenVectors for the two count vectors.</returns>
        public static double CalcCosinusBetweenWordsCounts(
            WordsAndCounts wordsCounts1, WordsAndCounts wordsCounts2)
        {
            // Union of the words of both collections (the dictionary is used as a set)
            Dictionary <string, bool> vocabulary = new Dictionary <string, bool>();
            foreach (string word in wordsCounts1.Words)
            {
                vocabulary.Add(word, true);
            }
            foreach (string word in wordsCounts2.Words)
            {
                vocabulary[word] = true;
            }

            // Fill both occurrence vectors in one pass so that the same index
            // refers to the same word in each of them
            int      dimensions = vocabulary.Count;
            double[] vector1    = new double[dimensions];
            double[] vector2    = new double[dimensions];
            int      position   = 0;
            foreach (string word in vocabulary.Keys)
            {
                vector1[position] = wordsCounts1[word];
                vector2[position] = wordsCounts2[word];
                position++;
            }

            return VectorUtils.CalcCosinusBetweenVectors(vector1, vector2);
        }
Exemple #11
0
        /// <summary>
        /// Retrieves the words that occur around <paramref name="word"/> in the title and text
        /// sentences returned by RetrieveWebContextSentences, together with their counts.
        /// </summary>
        /// <param name="word">The word whose context is retrieved; it is lower-cased before use.</param>
        /// <param name="langCode">Language code forwarded to the cache, the sentence retrieval and the word extraction.</param>
        /// <returns>
        /// The context words and counts, converted to basic forms when useLemmatization is set.
        /// The result is read from ContextWordsCache when an entry exists there, otherwise it
        /// is computed and added to that cache.
        /// </returns>
        public static WordsAndCounts RetrieveContextWords(string word, string langCode)
        {
            // Normalize BEFORE consulting the cache: entries are stored under the lower-cased
            // word, so looking them up with the caller's original casing could miss the cache
            // and trigger a full retrieval on every call.
            word = word.ToLowerInvariant();

            if (ContextWordsCache.IsInCache(word, langCode))
            {
                return ContextWordsCache.LoadFromCache(word, langCode);
            }

            WordsAndCounts contextWords = new WordsAndCounts();
            List <string>  titleSentences, textSentences;

            RetrieveWebContextSentences(word, langCode, out titleSentences, out textSentences);
            Dictionary <string, bool> allWordFormsDict = GetAllWordFormsDictionary(word, langCode);
            List <string>             phrasesToKeep    = new List <string>();

            phrasesToKeep.AddRange(allWordFormsDict.Keys);
            foreach (string titleSentence in titleSentences)
            {
                ExtractWordsAround(titleSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
            }
            foreach (string textSentence in textSentences)
            {
                ExtractWordsAround(textSentence, allWordFormsDict, langCode, contextWords, phrasesToKeep);
            }

            // With lemmatization enabled the basic forms are what gets cached and returned
            WordsAndCounts result = useLemmatization
                ? LemmaDictionaryUtils.GetBasicForms(contextWords, langCode)
                : contextWords;

            ContextWordsCache.AddToCache(word, langCode, result);
            return result;
        }
Exemple #12
0
        /// <summary>
        /// Retrieves the context words of <paramref name="word"/> and keeps only those that are
        /// frequent enough, capping each kept count by how often <paramref name="word"/> occurs
        /// in that context word's own context.
        /// </summary>
        /// <param name="word">The word whose context is retrieved.</param>
        /// <param name="langCode">Language code forwarded to the cache and to RetrieveContextWords.</param>
        /// <returns>
        /// The filtered words and counts. The result is read from ReversedContextCache when an
        /// entry exists there, otherwise it is computed and added to that cache.
        /// </returns>
        public static WordsAndCounts RetrieveContextWordsWithReverseContext(
            string word, string langCode)
        {
            if (ReversedContextCache.IsInCache(word, langCode))
            {
                return ReversedContextCache.LoadFromCache(word, langCode);
            }

            WordsAndCounts forwardContext = RetrieveContextWords(word, langCode);

            // Walk a snapshot of the words: entries are removed and rewritten inside the loop
            string[] candidates = new string[forwardContext.Words.Count];
            forwardContext.Words.CopyTo(candidates, 0);

            foreach (string candidate in candidates)
            {
                double forwardCount = forwardContext[candidate];

                if (forwardCount < SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
                {
                    // Too rare in the forward direction: drop it without a reverse lookup
                    forwardContext.RemoveWord(candidate);
                    continue;
                }

                // The new count is the smaller of the forward and the reverse count
                WordsAndCounts reverseContext = RetrieveContextWords(candidate, langCode);
                double         reverseCount   = reverseContext[word];
                forwardContext[candidate] = Math.Min(forwardCount, reverseCount);
            }

            ReversedContextCache.AddToCache(word, langCode, forwardContext);
            return forwardContext;
        }
Exemple #13
0
        /// <summary>
        /// Builds the context vector of a Russian word over BgRuDictionary.DictionaryBgWords:
        /// component i is the sum of the counts, in the word's context, of the Russian
        /// translations of the i-th Bulgarian dictionary word.
        /// </summary>
        /// <remarks>
        /// The context source depends on useIndirectContext / useQueryLemmatization, TF-IDF
        /// weighting is applied when useTFIDF is set, and when useReverseContext is set each
        /// component is capped by the reverse counts or zeroed if it is below the threshold.
        /// </remarks>
        /// <param name="ruWord">The Russian word to build the vector for.</param>
        /// <returns>
        /// The vector, read from VectorsCache when an entry exists there, otherwise computed
        /// and added to that cache.
        /// </returns>
        private static double[] CalculateRuDictionaryContextVector(string ruWord)
        {
            // First check the vectors cache
            if (VectorsCache.IsInCache(ruWord, LANG_CODE_RU))
            {
                return VectorsCache.LoadFromCache(ruWord, LANG_CODE_RU);
            }

            // Retrieve the word's local context; the indirect context takes precedence
            // over query lemmatization when both switches are set
            WordsAndCounts context =
                useIndirectContext
                    ? SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(ruWord, LANG_CODE_RU)
                    : useQueryLemmatization
                        ? SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(ruWord, LANG_CODE_RU)
                        : SemanticSimilarityUtils.RetrieveContextWords(ruWord, LANG_CODE_RU);

            if (useTFIDF)
            {
                ApplyTFIDFWeighting(context, LANG_CODE_RU);
            }

            // Match the dictionary words in the context: one component per Bulgarian
            // dictionary word, summed over all of its Russian translations
            string[] bgWords = BgRuDictionary.DictionaryBgWords;
            double[] vector  = new double[bgWords.Length];
            for (int slot = 0; slot < bgWords.Length; slot++)
            {
                foreach (string translation in BgRuDictionary.GetTranslations(bgWords[slot]))
                {
                    vector[slot] += context[translation];
                }
            }

            if (useReverseContext)
            {
                // Reverse match the context vector with the dictionary words' contexts
                for (int slot = 0; slot < bgWords.Length; slot++)
                {
                    double forwardCount = vector[slot];
                    if (forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
                    {
                        // Total occurrences of ruWord in the contexts of all translations
                        double reverseTotal = 0;
                        foreach (string translation in BgRuDictionary.GetTranslations(bgWords[slot]))
                        {
                            WordsAndCounts reverseContext =
                                SemanticSimilarityUtils.RetrieveContextWords(translation, LANG_CODE_RU);
                            reverseTotal += GetWordCountInContext(ruWord, LANG_CODE_RU, reverseContext);
                        }
                        vector[slot] = Math.Min(forwardCount, reverseTotal);
                    }
                    else
                    {
                        vector[slot] = 0;
                    }
                }
            }

            // Add the calculated context vector to the cache
            VectorsCache.AddToCache(ruWord, LANG_CODE_RU, vector);
            return vector;
        }
Exemple #14
0
        /// <summary>
        /// Builds the context vector of a Bulgarian word over BgRuDictionary.DictionaryBgWords:
        /// component i is the count of the i-th dictionary word in the word's context.
        /// </summary>
        /// <remarks>
        /// The context source depends on useIndirectContext / useQueryLemmatization, TF-IDF
        /// weighting is applied when useTFIDF is set, and when useReverseContext is set each
        /// component is capped by the reverse count or zeroed if it is below the threshold.
        /// </remarks>
        /// <param name="bgWord">The Bulgarian word to build the vector for.</param>
        /// <returns>
        /// The vector, read from VectorsCache when an entry exists there, otherwise computed
        /// and added to that cache.
        /// </returns>
        private static double[] CalculateBgDictionaryContextVector(string bgWord)
        {
            // First check the vectors cache
            if (VectorsCache.IsInCache(bgWord, LANG_CODE_BG))
            {
                return VectorsCache.LoadFromCache(bgWord, LANG_CODE_BG);
            }

            // Retrieve the word's local context; the indirect context takes precedence
            // over query lemmatization when both switches are set
            WordsAndCounts context =
                useIndirectContext
                    ? SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(bgWord, LANG_CODE_BG)
                    : useQueryLemmatization
                        ? SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(bgWord, LANG_CODE_BG)
                        : SemanticSimilarityUtils.RetrieveContextWords(bgWord, LANG_CODE_BG);

            if (useTFIDF)
            {
                ApplyTFIDFWeighting(context, LANG_CODE_BG);
            }

            // Match the dictionary words in the context: one component per dictionary word
            string[] bgWords = BgRuDictionary.DictionaryBgWords;
            double[] vector  = new double[bgWords.Length];
            for (int slot = 0; slot < bgWords.Length; slot++)
            {
                vector[slot] = context[bgWords[slot]];
            }

            if (useReverseContext)
            {
                // Reverse match the context vector with the dictionary words' contexts
                for (int slot = 0; slot < bgWords.Length; slot++)
                {
                    double forwardCount = vector[slot];
                    if (forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext)
                    {
                        // Cap the forward count by how often bgWord occurs in the
                        // dictionary word's own context
                        WordsAndCounts reverseContext =
                            SemanticSimilarityUtils.RetrieveContextWords(bgWords[slot], LANG_CODE_BG);
                        double reverseCount = GetWordCountInContext(bgWord, LANG_CODE_BG, reverseContext);
                        vector[slot] = Math.Min(forwardCount, reverseCount);
                    }
                    else
                    {
                        vector[slot] = 0;
                    }
                }
            }

            // Add the calculated context vector to the cache
            VectorsCache.AddToCache(bgWord, LANG_CODE_BG, vector);
            return vector;
        }