/// <summary>
/// Loads the English trigram model from the 'english_data' folder, then scores every
/// tweet whose sample reason is <c>user_data</c> against it and stores the score via
/// <c>SetTweetEnglishSimilarity</c>. Progress is written to the console every 100 tweets.
/// </summary>
public void Execute()
{
    Console.Out.WriteLine("Parsing english model trigrams from source data in 'english_data' directory");
    LanguageModel englishModel = ModelFactory.LoadModelFromFolder("english_data");
    Console.Out.WriteLine("Done parsing english model trigrams");

    int processed = 0;
    using (XtractDataContext context = new XtractDataContext())
    {
        // Loaded Tweet entities are never modified; scores are written through SetTweetEnglishSimilarity.
        context.ObjectTrackingEnabled = false;

        var userTweets = context.Tweets.Where(
            t => t.sample_reason.Equals(SampleReason.user_data.ToString()));

        foreach (Tweet current in userTweets)
        {
            LanguageModel tweetModel = new LanguageModel(current.text);
            double score = tweetModel.Similarity(englishModel);
            Debug.Assert(score <= 1.0d, "Similarity should never be more than 1");

            context.SetTweetEnglishSimilarity(current.twitter_id, score);

            processed++;
            if (processed % 100 != 0)
            {
                continue;
            }

            // Every 100th tweet: print a sample row followed by the running total.
            Console.Out.WriteLine("{0}:{1}:'{2}'", current.twitter_id, score, current.text);
            Console.Out.WriteLine("Updated similarity for " + processed + " tweets");
        }

        Console.Out.WriteLine("Updated similarity for " + processed + " tweets");
    }
}
/// <summary>
/// Scores a fixed set of sample tweets against the model loaded from the "data" folder
/// and prints them as "similarity|text" lines, ordered by ascending similarity.
/// </summary>
private static void SimilarityTest()
{
    List<string> examples = new List<string>()
    {
        "so i said, hey what are you doing?",
        "If you're wanting tickets for tomorrow night's home opener or any night at \"The Swamp\", call the 'Dogs front office at 910-426-5900",
        "with the steps required to download and use files from",
        "How bout me and u gets sum breakfast", // barely English, though
        "Fashion Terms shoes online gr dresses by flirt john surratt . http://www.lsaco.com/fashion-terms-shoes-online-gr-dresses-by-flirt/",
        "@wvpv Do let me know how you get on... am Psonar CTO. Any problems or anything we can do to make it better, just say so! Thanks!",
        "@MissFAB_LC the circus.. Aunt Cina takin Juju 2day!!!!",
        "@1989515 난 작업견이라는 말까지는 안했어,,, 그냥 음큼견이라는 거지... ㅋㅋ 넌 순수해... 응큼한게 니 순수한 멋이야... ㅋㅋㅋㅋ @aphrodite_sung @jth800 #반말한당_",
        "装gentoo的时候可以看完一本小说。当然屁长屁长的还是看不完的",
        "RT ιƒ уσυ'яє ѕιηgℓє &hearts", // technically 'English', but written with look-alike characters
        "Dmn tuh? RT @vaniessadh: di apartemen sendokuran so scary ternyata http://myloc.me/7sqYQ",
        "@QhaCaembie Hahaha... Tau de yg dah bs twitteran di HP. Bilang pa u sm w. Haha",
        "@BustinnJieeber haha :) LOVE U 2 <3 ♥",
        "|@Ignorancelove WO MAI HL. LATER FAT. AND THE MILK SO ERM, EEEW."
    };

    LanguageModel english = ModelFactory.LoadModelFromFolder("data");

    // A SortedList keyed by similarity throws ArgumentException as soon as two examples
    // produce the same score, so collect the pairs in a plain list and sort afterwards.
    List<KeyValuePair<double, string>> classified = new List<KeyValuePair<double, string>>(examples.Count);
    foreach (string example in examples)
    {
        LanguageModel test = new LanguageModel(example);
        classified.Add(new KeyValuePair<double, string>(test.Similarity(english), example));
    }

    // Ascending by similarity; the relative order of equal scores is unspecified.
    classified.Sort((left, right) => left.Key.CompareTo(right.Key));

    foreach (KeyValuePair<double, string> entry in classified)
    {
        Console.Out.WriteLine(entry.Key + "|" + entry.Value);
    }
}
/// <summary>
/// Verifies that the large English model compared with itself yields a similarity of
/// (approximately) 1, and that a single tweet compared with it does not score above 1.
/// </summary>
public void LargeEnglishShouldHaveSimilarityOf1()
{
    LanguageModel english_large = ModelFactory.LoadModelFromFolder("english_data"); // takes a long time to parse

    AssertPrettyMuchEqual(1.0, english_large.Similarity(english_large), "Expected identical large models to have similarity of 1");
    AssertPrettyMuchEqual(1.0, english_large.Similarity(english_large), "Expected identical large models to have similarity of 1, either way around");

    LanguageModel tweetModel = new LanguageModel("Aye #shoutout to all the single mothers out there doin what they gotta do to provide for their kids and themselves. u r appreciated :)");

    // Assert.LessOrEqual(a, b) checks a <= b, so the similarity must be the first argument;
    // the previous order asserted 1.0 <= similarity, the opposite of the stated expectation.
    Assert.LessOrEqual(tweetModel.Similarity(english_large), 1.0d, "expected similarity to large model to be always less than 1");
}
/// <summary>
/// Two models built from the same sentence should have a similarity of (approximately) 1,
/// whichever model the comparison is called on.
/// </summary>
public void EqualTextShouldBeOne()
{
    const string sentence = "once upon a time there was a small boy";

    LanguageModel first = new LanguageModel(sentence);
    LanguageModel second = new LanguageModel(sentence);

    double firstToSecond = first.Similarity(second);
    AssertPrettyMuchEqual(1.0, firstToSecond, "Expected identical strings to have similarity of 1");

    double secondToFirst = second.Similarity(first);
    AssertPrettyMuchEqual(1.0, secondToFirst, "Expected identical strings to have similarity of 1, either way around");
}
/// <summary>
/// Two sentences differing by a single letter should produce a similarity that lies
/// between the lower threshold and 1 (inclusive).
/// </summary>
public void SimilarTextShouldCloseBeOne()
{
    string txt1 = "once upon a time there was a small boy";
    string txt2 = "once upon a time there waz a small boy";

    LanguageModel model1 = new LanguageModel(txt1);
    LanguageModel model2 = new LanguageModel(txt2);

    // Compute once so both bounds are checked against the same value.
    double similarity = model1.Similarity(model2);

    // NOTE(review): 0.09 is a very loose bound for "close to 1" - possibly meant 0.9; confirm.
    Assert.GreaterOrEqual(similarity, 0.09d, "Expected very similar strings to have similarity close 1");

    // The strings are not identical, so this is only an upper bound on the score.
    Assert.LessOrEqual(similarity, 1.0d, "Expected similarity of similar strings to never exceed 1");
}
/// <summary>
/// Prints the similarity between <paramref name="text"/> and the <c>_english</c> model.
/// Despite the name, nothing is asserted at the moment: the threshold check is disabled.
/// </summary>
/// <param name="text">Text to build a language model from and compare.</param>
private void AssertNotEnglish(string text)
{
    LanguageModel candidate = new LanguageModel(text);
    double score = candidate.Similarity(_english);

    Console.Out.WriteLine("Similarity:" + score + "\t Text:" + text);

    // Disabled check, kept for reference:
    // Assert.LessOrEqual(model.Similarity(_english), 0.05d, "Expected non english classification");
}
/// <summary>
/// Prints the similarity between <paramref name="text"/> and the <c>_english</c> model.
/// Despite the name, nothing is asserted at the moment: the threshold check is disabled.
/// </summary>
/// <param name="text">Text to build a language model from and compare.</param>
private void AssertEnglish(string text)
{
    LanguageModel candidate = new LanguageModel(text);

    // FIXME: similarity should always be below 1, but it sometimes ends up above that.
    double score = candidate.Similarity(_english);
    Console.Out.WriteLine("Similarity:" + score + "\t Text:" + text);

    // Disabled check, kept for reference:
    // Assert.GreaterOrEqual(model.Similarity(_english), 0.05d, "Expected english classification");
}