/// <summary>
/// Writes a semicolon-separated report of Bulgarian/Russian word pairs to
/// <paramref name="outputFileName"/>.
/// </summary>
/// <remarks>
/// Every key of <c>allBgWords</c> is paired with every key of <c>allRuWords</c>. A pair
/// produces a row only when its MMEDR similarity is at least <c>MIN_WORD_SIMILARITY</c>
/// and <c>GetFriendsJudgeAnswer</c> returns a non-null answer for it. The file is opened
/// in overwrite (non-append) mode with <c>FILE_ENCODING</c>, and progress is echoed to
/// the console once per Bulgarian word.
/// </remarks>
/// <param name="outputFileName">Path of the report file to create or overwrite.</param>
public void ExtractCognatesAndFalseFriends(string outputFileName)
{
    string headerRow = "BG Word;RU Word;Friends?;MMED Similarity;BG Count;RU Count;Corresponding Count;Friendness;Semantic Similarity";
    string rowFormat = "{0};{1};{2};{3};{4};{5};{6};{7};{8}";

    using (StreamWriter report = new StreamWriter(outputFileName, false, FILE_ENCODING))
    {
        report.WriteLine(headerRow);

        foreach (string bulgarian in this.allBgWords.Keys)
        {
            Console.WriteLine("Extracting words statistics: " + bulgarian);

            foreach (string russian in this.allRuWords.Keys)
            {
                double mmedSimilarity = MMEDR.CalculateSimilarity(bulgarian, russian);

                // Written as a negated ">=" (not "<") so that a NaN similarity
                // is rejected as well.
                if (!(mmedSimilarity >= MIN_WORD_SIMILARITY))
                {
                    continue;
                }

                // A null answer means there is no judgement for this pair: skip it.
                string judgeAnswer = GetFriendsJudgeAnswer(bulgarian, russian);
                if (judgeAnswer == null)
                {
                    continue;
                }

                float bgOccurrences = CalculateBgCount(bulgarian);
                float ruOccurrences = CalculateRuCount(russian);
                float sharedOccurrences = CalculateCorrespondingCount(bulgarian, russian);
                float friendnessScore = CalculateFriendness(bgOccurrences, ruOccurrences, sharedOccurrences);
                double crossSimilarity = SemanticSimilarityUtils.CrossSim(bulgarian, russian);

                report.WriteLine(
                    rowFormat,
                    bulgarian,
                    russian,
                    judgeAnswer,
                    mmedSimilarity,
                    bgOccurrences,
                    ruOccurrences,
                    sharedOccurrences,
                    friendnessScore,
                    crossSimilarity);

                // Flushed after every row; presumably so that partial results
                // are on disk if a long run is interrupted.
                report.Flush();
            }
        }
    }
}
/// <summary>
/// Computes a normalized similarity between a Bulgarian and a Russian word from their
/// weighted Levenshtein distance (MMEDR).
/// </summary>
/// <remarks>
/// Both words are first rewritten with a fixed, ordered list of letter substitutions
/// (transcription rules followed by consonant-cluster reductions), and the distance is
/// measured on the rewritten forms. The arguments are only reassigned locally; the
/// caller's strings are not affected.
/// </remarks>
/// <param name="bgWord">The Bulgarian word.</param>
/// <param name="ruWord">The Russian word.</param>
/// <returns>
/// <c>(maxLen - distance) / maxLen</c>, where <c>maxLen</c> is the length of the longer
/// rewritten word; <c>1.0</c> when both rewritten words are empty.
/// </returns>
private static double CalcMMEDR(string bgWord, string ruWord)
{
    // Transcribe the Bulgarian word
    ReplaceAll(ref bgWord, "щ", "шт");
    ReplaceAll(ref bgWord, "ьо", "ё");
    ReplaceAll(ref bgWord, "йо", "ё");

    // Transcribe the Russian word
    ReplaceAll(ref ruWord, "э", "е");
    ReplaceAll(ref ruWord, "щ", "сч");
    ReplaceAll(ref ruWord, "ъ", "");
    ReplaceAll(ref ruWord, "ы", "и");
    ReplaceAll(ref ruWord, "ь", "");

    // Reduce the Bulgarian "зс" cluster to "с" (the same target the Russian "сс" is
    // reduced to below)
    ReplaceAll(ref bgWord, "зс", "с");

    // Remove double consonants in the Russian word
    ReplaceAll(ref ruWord, "бб", "б");
    ReplaceAll(ref ruWord, "жж", "ж");
    ReplaceAll(ref ruWord, "кк", "к");
    ReplaceAll(ref ruWord, "лл", "л");
    ReplaceAll(ref ruWord, "мм", "м");
    ReplaceAll(ref ruWord, "пп", "п");
    ReplaceAll(ref ruWord, "сс", "с");
    ReplaceAll(ref ruWord, "фф", "ф");

    // Both words can end up empty (e.g. "" against "ь"). Dividing by a zero length
    // would yield NaN, so two empty words are reported as identical instead.
    if (bgWord.Length == 0 && ruWord.Length == 0)
    {
        return 1.0;
    }

    // Calculate weighted Levenshtein distance (MMEDR)
    MMEDR distCalc = new MMEDR(bgWord, ruWord);
    double distance = distCalc.CalcDistance(bgWord.Length, ruWord.Length);

    // Normalize by the longer word's length
    double maxLen = Math.Max(bgWord.Length, ruWord.Length);
    double similarity = (maxLen - distance) / maxLen;
    return similarity;
}