/// <summary>
/// Smoke test that creates a <c>TextConverter</c> and calls <c>CatCorpusDict()</c> on it.
/// </summary>
/// <remarks>
/// There are no assertions: the test passes as long as the call does not throw.
/// </remarks>
public void ut_170723_test_recompile_corpus_simple_cat_corpus()
{
    // 5 recompile the dict
    new TextConverter().CatCorpusDict();
}
/// <summary>
/// Full "learn a new word" loop for the arabizi keyword <c>makatjich</c>:
/// checks the keyword is not converted at first, fetches its most popular variant and a
/// post containing it through the Twingly helpers, adds that post to the corpus,
/// recompiles the dictionary, and asserts the keyword now converts to that variant.
/// </summary>
/// <remarks>
/// The test calls the Twingly helpers in <c>OADRJNLPCommon.Business.Business</c> and
/// modifies the corpus through <c>TextFrequency</c>.
/// </remarks>
public void ut_170721_test_recompile_corpus_full_loop()
{
    var textConverter = new TextConverter();
    String twinglyApi15Url = "https://data.twingly.net/socialfeed/a/api/v1.5/";
    // NOTE(review): API key is hard-coded in source; consider moving it to test configuration.
    String twinglyApiKey = "2A4CF6A4-4968-46EF-862F-2881EF597A55";

    // Alternative keyword kept for reference: "netrecheh" (> 2000 variants).
    String arabiziKeyword = "makatjich"; // > 448 variants

    // assert it is not converted at start
    var arabicKeyword = textConverter.Convert(arabiziKeyword);
    Assert.AreEqual(arabiziKeyword, arabicKeyword);

    // 1 get all variants
    var variants = textConverter.GetAllTranscriptions(arabiziKeyword);

    // 2 get most popular keyword; fail here with a clear message rather than
    // searching for (and learning from) an empty keyword further down
    var mostPopularKeyword = OADRJNLPCommon.Business.Business.getMostPopularVariantFromFBViaTwingly(variants, twinglyApi15Url, twinglyApiKey);
    Assert.AreNotEqual(String.Empty, mostPopularKeyword, "most popular");

    // 3 get a post containing this keyword
    var postText = OADRJNLPCommon.Business.Business.getPostBasedOnKeywordFromFBViaTwingly(mostPopularKeyword, twinglyApi15Url, twinglyApiKey, true);
    if (postText == String.Empty)
    {
        // if no results, look everywhere (same fallback as the rtbdaw test;
        // presumably the last argument restricts the search when true - TODO confirm)
        postText = OADRJNLPCommon.Business.Business.getPostBasedOnKeywordFromFBViaTwingly(mostPopularKeyword, twinglyApi15Url, twinglyApiKey, false);
    }
    Assert.AreNotEqual(String.Empty, postText, "post");

    // 4 add this post to dict, unless it is already there: re-running the test
    // must not append the same sentence to the corpus again
    var textFrequency = new TextFrequency();
    if (textFrequency.CorpusContainsSentence(postText) == false)
    {
        textFrequency.AddPhraseToCorpus(postText);
    }

    // 5 recompile the dict
    textConverter.CatCorpusDict();
    textConverter.SrilmLmDict();

    // assert it is now converted
    arabicKeyword = textConverter.Convert(arabiziKeyword);
    Assert.AreEqual(mostPopularKeyword, arabicKeyword);
}
/// <summary>
/// Full "learn a new word" loop for the arabizi keyword <c>rtbdaw</c>, which is expected
/// to have fewer than 40 transcription variants: fetches the most popular variant and a
/// post containing it through the Twingly helpers, adds the post to the corpus if it is
/// not already there, recompiles the dictionary, and asserts the keyword converts to
/// that variant.
/// </summary>
public void ut_170727_test_recompile_corpus_rtbdaw_under_40_variants()
{
    // NOTE(review): API key is hard-coded in source; consider moving it to test configuration.
    const string twinglyUrl = "https://data.twingly.net/socialfeed/a/api/v1.5/";
    const string twinglyKey = "2A4CF6A4-4968-46EF-862F-2881EF597A55";

    // 1 arabizi
    const string keyword = "rtbdaw";

    // 2 converter used for both the variants and the final conversion
    var converter = new TextConverter();

    // 4 get all variants
    var candidates = converter.GetAllTranscriptions(keyword);
    Assert.IsTrue(candidates.Count < 40);

    // 5 get most popular keyword
    var mostPopular = OADRJNLPCommon.Business.Business.getMostPopularVariantFromFBViaTwingly(candidates, twinglyUrl, twinglyKey);
    Assert.AreNotEqual(String.Empty, mostPopular, "most popular");

    // 7 get a post containing this keyword
    var post = OADRJNLPCommon.Business.Business.getPostBasedOnKeywordFromFBViaTwingly(mostPopular, twinglyUrl, twinglyKey, true);
    if (post == String.Empty)
    {
        // if no results, look everywhere
        post = OADRJNLPCommon.Business.Business.getPostBasedOnKeywordFromFBViaTwingly(mostPopular, twinglyUrl, twinglyKey, false);
    }
    Assert.AreNotEqual(String.Empty, post, "post");

    // 8 add this post to dict (only when the corpus does not already hold it)
    var corpus = new TextFrequency();
    var alreadyInCorpus = corpus.CorpusContainsSentence(post);
    if (alreadyInCorpus == false)
    {
        corpus.AddPhraseToCorpus(post);
    }

    // 9 recompile the dict
    converter.CatCorpusDict();
    converter.SrilmLmDict();

    // 10 assert it is now converted
    var converted = converter.Convert(keyword);
    Assert.AreEqual(mostPopular, converted);
}
/// <summary>
/// Full "learn a new word" loop for the arabizi keyword <c>hazka</c>: drops the expected
/// post from the corpus, checks the keyword is left unconverted in a sample sentence,
/// fetches the most popular variant and a post containing it through the Twingly helpers,
/// adds the post to the corpus, recompiles the dictionary, and asserts the keyword
/// converts to that variant.
/// </summary>
public void ut_170727_test_recompile_corpus_full_loop_hazka_under_35_variants()
{
    // NOTE(review): API key is hard-coded in source; consider moving it to test configuration.
    const string twinglyUrl = "https://data.twingly.net/socialfeed/a/api/v1.5/";
    const string twinglyKey = "2A4CF6A4-4968-46EF-862F-2881EF597A55";

    // this is the sentence supposedly returned by twingly for the keyword (supposedly one line)
    var expectedPost = @"انا القوة الخارقة لي عندي هي فاش كندوز من حدا شي سعاي و تيقولي شي درهم الله يرحم الواليدين الله ينجحك الله يطول فعمرك .. تنقول امين فنفسي و تنزطم .. بحال الى خديت دعوة فابور .. و متنعطيهش درهم حيت تنكون حازق و يلا كانت عندي 2 دراهم تنصرفها و تنعطي لواحد درهم حتى كيدعي معايا و تنعطي لشي واحد اخر .. ليكونومي";

    // 0 drop the test phrase from the dict
    var corpus = new TextFrequency();
    corpus.DropPhraseFromCorpus(expectedPost);

    // 1 arabizi
    const string sentence = "Al houb wa al hazka";
    const string keyword = "hazka";

    // 2 convert first pass
    var converter = new TextConverter();
    var firstPass = converter.Convert(sentence);

    // 2 latin words: the keyword must be the first word left in latin script
    var latinWords = TextTools.ExtractLatinWords(firstPass);
    Assert.AreEqual(keyword, latinWords[0].Value);

    // 3 no "ma...ch" preprocessing for this keyword: it is used as is

    // 4 get all variants
    // NOTE(review): the test name says "under 35 variants" but the bound checked is 50 - confirm which is intended.
    var candidates = converter.GetAllTranscriptions(keyword);
    Assert.IsTrue(candidates.Count < 50);

    // 5 get most popular keyword (its exact value is not asserted here)
    var mostPopular = OADRJNLPCommon.Business.Business.getMostPopularVariantFromFBViaTwingly(candidates, twinglyUrl, twinglyKey);

    // 6 nothing to re-add: no "ma" / "ch" was stripped in step 3

    // 7 get a post containing this keyword
    var post = OADRJNLPCommon.Business.Business.getPostBasedOnKeywordFromFBViaTwingly(mostPopular, twinglyUrl, twinglyKey, true);
    Assert.AreEqual(expectedPost, post);

    // 8 add this post to dict
    corpus.AddPhraseToCorpus(post);

    // 9 recompile the dict
    converter.CatCorpusDict();
    converter.SrilmLmDict();

    // 10 assert it is now converted
    var converted = converter.Convert(keyword);
    Assert.AreEqual(mostPopular, converted);
}
/// <summary>
/// Full "learn a new word" loop for the arabizi keyword <c>makatjich</c>, with the
/// "ma...ch" wrapper stripped before looking up variants: drops the expected post from
/// the corpus, checks the keyword is left unconverted in a sample sentence, fetches the
/// most popular variant of the stripped keyword, re-wraps it, fetches a post containing
/// the full word through the Twingly helpers, adds it to the corpus, recompiles the
/// dictionary, and asserts the keyword converts to the full Arabic word.
/// </summary>
public void ut_170725_test_recompile_corpus_full_loop_makatjich_under_50_variants()
{
    // NOTE(review): API key is hard-coded in source; consider moving it to test configuration.
    const string twinglyUrl = "https://data.twingly.net/socialfeed/a/api/v1.5/";
    const string twinglyKey = "2A4CF6A4-4968-46EF-862F-2881EF597A55";

    // this is the sentence supposedly returned by twingly for keyword ماكتجيش
    var rawExpectedPost = @"هاد البومب ماكتجيش عشوائية 😁 ، كتوجد ليها من الليلة ديال لبارح ✍️ يا بومبييناااي 💪";

    // make it one line
    var expectedPost = rawExpectedPost.Replace("\r\n", " ");
    var expectedOneLine = "هاد البومب ماكتجيش عشوائية 😁 ، كتوجد ليها من الليلة ديال لبارح ✍️ يا بومبييناااي 💪";
    Assert.AreEqual(expectedOneLine, expectedPost);

    // 0 drop the test phrase from the dict
    var corpus = new TextFrequency();
    corpus.DropPhraseFromCorpus(expectedPost);

    // 1 arabizi
    const string sentence = "Ya wlad lkhab nta li kadwi makatjich lwa9afat w kadwi ya terikt jradistat";
    const string keyword = "makatjich"; // > 448 variants when used whole

    // 2 convert first pass
    var converter = new TextConverter();
    var firstPass = converter.Convert(sentence);

    // 2 latin words: the keyword must be the first word left in latin script
    var latinWords = TextTools.ExtractLatinWords(firstPass);
    Assert.AreEqual(keyword, latinWords[0].Value);

    // 3 preprocess if ma/ch: keep only what sits between the leading "ma" and the trailing "ch"
    const string maChPattern = @"\bma(.+)ch\b";
    var strippedKeyword = Regex.Replace(keyword, maChPattern, "$1");
    Assert.AreEqual("katji", strippedKeyword);

    // 4 get all variants
    var candidates = converter.GetAllTranscriptions(strippedKeyword);
    Assert.IsTrue(candidates.Count < 50);

    // 5 get most popular keyword
    var mostPopular = OADRJNLPCommon.Business.Business.getMostPopularVariantFromFBViaTwingly(candidates, twinglyUrl, twinglyKey);
    var expectedMostPopular = "كتجي";
    Assert.AreEqual(expectedMostPopular, mostPopular);

    // 6 re-add "ma" & "ch"
    var fullArabicKeyword = "ما" + mostPopular + "ش";
    Assert.AreEqual("ماكتجيش", fullArabicKeyword);

    // 7 get a post containing this keyword
    var post = OADRJNLPCommon.Business.Business.getPostBasedOnKeywordFromFBViaTwingly(fullArabicKeyword, twinglyUrl, twinglyKey, true);
    Assert.AreEqual(expectedPost, post);

    // 8 add this post to dict
    corpus.AddPhraseToCorpus(post);

    // 9 recompile the dict
    converter.CatCorpusDict();
    converter.SrilmLmDict();

    // 10 assert it is now converted
    var converted = converter.Convert(keyword);
    Assert.AreEqual(fullArabicKeyword, converted);
}