/// <summary>
        /// Writes a semicolon-separated report of Bulgarian/Russian word pairs to a file.
        /// Every key of <c>allBgWords</c> is paired with every key of <c>allRuWords</c>;
        /// a row is emitted only when the MMEDR similarity of the pair is at least
        /// <c>MIN_WORD_SIMILARITY</c> and <c>GetFriendsJudgeAnswer</c> returns a non-null answer.
        /// </summary>
        /// <param name="outputFileName">
        /// Path of the report file. It is opened with append disabled and <c>FILE_ENCODING</c>.
        /// </param>
        public void ExtractCognatesAndFalseFriends(string outputFileName)
        {
            using (StreamWriter report = new StreamWriter(outputFileName, false, FILE_ENCODING))
            {
                // Header row: column names in the same order as the values written below.
                report.WriteLine("BG Word;RU Word;Friends?;MMED Similarity;BG Count;RU Count;Corresponding Count;Friendness;Semantic Similarity");

                foreach (string bgWord in this.allBgWords.Keys)
                {
                    // Progress output, once per Bulgarian word.
                    Console.WriteLine("Extracting words statistics: " + bgWord);

                    foreach (string ruWord in this.allRuWords.Keys)
                    {
                        double mmedSimilarity = MMEDR.CalculateSimilarity(bgWord, ruWord);

                        // Negated ">=" (rather than "<") so that a NaN score is skipped as well.
                        if (!(mmedSimilarity >= MIN_WORD_SIMILARITY))
                        {
                            continue;
                        }

                        string judgeAnswer = GetFriendsJudgeAnswer(bgWord, ruWord);
                        if (judgeAnswer == null)
                        {
                            continue;
                        }

                        float  bgOccurrences     = CalculateBgCount(bgWord);
                        float  ruOccurrences     = CalculateRuCount(ruWord);
                        float  sharedOccurrences = CalculateCorrespondingCount(bgWord, ruWord);
                        float  friendnessScore   = CalculateFriendness(bgOccurrences, ruOccurrences, sharedOccurrences);
                        double crossSimilarity   = SemanticSimilarityUtils.CrossSim(bgWord, ruWord);

                        report.WriteLine(
                            "{0};{1};{2};{3};{4};{5};{6};{7};{8}",
                            bgWord, ruWord, judgeAnswer, mmedSimilarity, bgOccurrences,
                            ruOccurrences, sharedOccurrences, friendnessScore, crossSimilarity);

                        // Flush after each row so rows already produced reach the file
                        // even if a later pair fails.
                        report.Flush();
                    }
                }
            }
        }
示例#2
0
        /// <summary>
        /// Returns the dictionary context vector of a Bulgarian word, loading it from
        /// <c>VectorsCache</c> when present and computing and caching it otherwise.
        /// </summary>
        /// <param name="bgWord">The Bulgarian word whose context vector is requested.</param>
        /// <returns>
        /// An array with one component per entry of <c>BgRuDictionary.DictionaryBgWords</c>.
        /// Component <c>i</c> is the count of dictionary word <c>i</c> in the context retrieved
        /// for <paramref name="bgWord"/>; with <c>useReverseContext</c> enabled it is reduced
        /// as described in the body.
        /// </returns>
        private static double[] CalculateBgDictionaryContextVector(string bgWord)
        {
            // Cache hit: nothing to compute.
            if (VectorsCache.IsInCache(bgWord, LANG_CODE_BG))
            {
                return VectorsCache.LoadFromCache(bgWord, LANG_CODE_BG);
            }

            // Choose the context source. Indirect context takes precedence over
            // query lemmatization, which takes precedence over the plain context.
            WordsAndCounts context =
                useIndirectContext
                    ? SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(bgWord, LANG_CODE_BG)
                    : useQueryLemmatization
                        ? SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(bgWord, LANG_CODE_BG)
                        : SemanticSimilarityUtils.RetrieveContextWords(bgWord, LANG_CODE_BG);

            if (useTFIDF)
            {
                ApplyTFIDFWeighting(context, LANG_CODE_BG);
            }

            // Forward pass: look up every dictionary word in the context of bgWord.
            string[] dictWords = BgRuDictionary.DictionaryBgWords;
            double[] vector    = new double[dictWords.Length];
            for (int idx = 0; idx < dictWords.Length; idx++)
            {
                vector[idx] = context[dictWords[idx]];
            }

            if (useReverseContext)
            {
                // Reverse pass: a component survives only if its forward count reaches the
                // configured minimum, and is then capped by how often bgWord is counted in
                // the dictionary word's own context.
                for (int idx = 0; idx < dictWords.Length; idx++)
                {
                    double forwardCount   = vector[idx];
                    bool   frequentEnough =
                        forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext;

                    if (!frequentEnough)
                    {
                        vector[idx] = 0;
                        continue;
                    }

                    WordsAndCounts reverseContext =
                        SemanticSimilarityUtils.RetrieveContextWords(dictWords[idx], LANG_CODE_BG);
                    double reverseCount = GetWordCountInContext(bgWord, LANG_CODE_BG, reverseContext);
                    vector[idx] = Math.Min(forwardCount, reverseCount);
                }
            }

            // Remember the result for subsequent calls.
            VectorsCache.AddToCache(bgWord, LANG_CODE_BG, vector);

            return vector;
        }
示例#3
0
        /// <summary>
        /// Returns the dictionary context vector of a Russian word, loading it from
        /// <c>VectorsCache</c> when present and computing and caching it otherwise.
        /// </summary>
        /// <param name="ruWord">The Russian word whose context vector is requested.</param>
        /// <returns>
        /// An array indexed like <c>BgRuDictionary.DictionaryBgWords</c>. Component <c>i</c>
        /// is the sum, over all translations returned by <c>BgRuDictionary.GetTranslations</c>
        /// for Bulgarian dictionary word <c>i</c>, of the translation's count in the context
        /// retrieved for <paramref name="ruWord"/>; with <c>useReverseContext</c> enabled it
        /// is reduced as described in the body.
        /// </returns>
        private static double[] CalculateRuDictionaryContextVector(string ruWord)
        {
            // Cache hit: nothing to compute.
            if (VectorsCache.IsInCache(ruWord, LANG_CODE_RU))
            {
                return VectorsCache.LoadFromCache(ruWord, LANG_CODE_RU);
            }

            // Choose the context source. Indirect context takes precedence over
            // query lemmatization, which takes precedence over the plain context.
            WordsAndCounts context =
                useIndirectContext
                    ? SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(ruWord, LANG_CODE_RU)
                    : useQueryLemmatization
                        ? SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(ruWord, LANG_CODE_RU)
                        : SemanticSimilarityUtils.RetrieveContextWords(ruWord, LANG_CODE_RU);

            if (useTFIDF)
            {
                ApplyTFIDFWeighting(context, LANG_CODE_RU);
            }

            // Forward pass: the vector is indexed by the Bulgarian dictionary words, and each
            // slot accumulates the context counts of that word's Russian translations.
            string[] bgDictWords = BgRuDictionary.DictionaryBgWords;
            double[] vector      = new double[bgDictWords.Length];
            for (int idx = 0; idx < bgDictWords.Length; idx++)
            {
                List <string> translations = BgRuDictionary.GetTranslations(bgDictWords[idx]);
                foreach (string translation in translations)
                {
                    vector[idx] += context[translation];
                }
            }

            if (useReverseContext)
            {
                // Reverse pass: a component survives only if its forward count reaches the
                // configured minimum, and is then capped by the total count of ruWord in the
                // contexts of all translations of the dictionary word.
                for (int idx = 0; idx < bgDictWords.Length; idx++)
                {
                    double forwardCount   = vector[idx];
                    bool   frequentEnough =
                        forwardCount >= SemanticSimilarityUtils.minWordOccurencesForReverseOrIndirectContext;

                    if (!frequentEnough)
                    {
                        vector[idx] = 0;
                        continue;
                    }

                    double        reverseTotal = 0;
                    List <string> translations = BgRuDictionary.GetTranslations(bgDictWords[idx]);
                    foreach (string translation in translations)
                    {
                        WordsAndCounts reverseContext =
                            SemanticSimilarityUtils.RetrieveContextWords(translation, LANG_CODE_RU);
                        reverseTotal += GetWordCountInContext(ruWord, LANG_CODE_RU, reverseContext);
                    }

                    vector[idx] = Math.Min(forwardCount, reverseTotal);
                }
            }

            // Remember the result for subsequent calls.
            VectorsCache.AddToCache(ruWord, LANG_CODE_RU, vector);

            return vector;
        }
示例#4
0
        /// <summary>
        /// Compares a pair of words in the same language by the word counts of their contexts.
        /// </summary>
        /// <param name="firstWord">The first word of the pair.</param>
        /// <param name="secondWord">The second word of the pair.</param>
        /// <param name="langCode">Language code passed to the context retrieval and weighting helpers.</param>
        /// <returns>
        /// The value computed by <c>VectorUtils.CalcCosinusBetweenWordsCounts</c> for the two
        /// contexts (presumably the cosine between the two count vectors; confirm against that helper).
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// <c>vectorDiffAlgorithm</c> is anything other than <c>VectorDiffAlgorithm.COSINE</c>.
        /// </exception>
        public static double SemSim(
            string firstWord, string secondWord, string langCode)
        {
            // Fail fast: reject an unsupported algorithm before retrieving any context,
            // since the retrieved contexts would be discarded anyway.
            if (vectorDiffAlgorithm != VectorDiffAlgorithm.COSINE)
            {
                throw new NotSupportedException("Algorithm not supported!");
            }

            WordsAndCounts firstWordContext  = RetrieveSemSimContext(firstWord, langCode);
            WordsAndCounts secondWordContext = RetrieveSemSimContext(secondWord, langCode);

            if (useTFIDF)
            {
                // NOTE(review): if the retrieval helpers can hand back the same instance for
                // both words (e.g. identical words served from a cache), the weighting would
                // be applied to it twice - confirm.
                ApplyTFIDFWeighting(firstWordContext, langCode);
                ApplyTFIDFWeighting(secondWordContext, langCode);
            }

            return VectorUtils.CalcCosinusBetweenWordsCounts(firstWordContext, secondWordContext);
        }

        /// <summary>
        /// Retrieves the context of a word for <c>SemSim</c>, picking the retrieval strategy
        /// from the configuration flags: reverse context first, then query lemmatization,
        /// then indirect context, otherwise the plain context.
        /// </summary>
        private static WordsAndCounts RetrieveSemSimContext(string word, string langCode)
        {
            if (useReverseContext)
            {
                return SemanticSimilarityUtils.RetrieveContextWordsWithReverseContext(word, langCode);
            }

            if (useQueryLemmatization)
            {
                return SemanticSimilarityUtils.RetrieveContextWordsForAllWordForms(word, langCode);
            }

            if (useIndirectContext)
            {
                return SemanticSimilarityUtils.RetrieveContextWordsWithIndirectContext(word, langCode);
            }

            return SemanticSimilarityUtils.RetrieveContextWords(word, langCode);
        }