Пример #1
0
        /// <summary>
        /// Loads the English trigram model from the 'english_data' folder, scores every
        /// tweet whose sample reason is <c>user_data</c> against it, and stores each
        /// score through the data context. Progress is printed every 100 tweets.
        /// </summary>
        public void Execute()
        {
            Console.Out.WriteLine("Parsing english model trigrams from source data in 'english_data' directory");
            LanguageModel englishModel = ModelFactory.LoadModelFromFolder("english_data");
            Console.Out.WriteLine("Done parsing english model trigrams");

            int processed = 0;
            using (XtractDataContext context = new XtractDataContext())
            {
                // Change tracking is switched off; scores are persisted through
                // SetTweetEnglishSimilarity rather than by mutating tracked entities.
                context.ObjectTrackingEnabled = false;

                var userDataTweets = context.Tweets.Where(
                    tweet => tweet.sample_reason.Equals(SampleReason.user_data.ToString()));

                foreach (Tweet current in userDataTweets)
                {
                    double score = new LanguageModel(current.text).Similarity(englishModel);
                    Debug.Assert(score <= 1.0d, "Similarity should never be more than 1");
                    context.SetTweetEnglishSimilarity(current.twitter_id, score);

                    processed++;
                    if (processed % 100 != 0)
                    {
                        continue;
                    }

                    // Periodic progress report: one sample row plus the running total.
                    Console.Out.WriteLine("{0}:{1}:'{2}'", current.twitter_id, score, current.text);
                    Console.Out.WriteLine("Updated similarity for " + processed + " tweets");
                }

                Console.Out.WriteLine("Updated similarity for " + processed + " tweets");
            }
        }
Пример #2
0
        /// <summary>
        /// Scores a fixed set of sample tweets against the model loaded from the
        /// "data" folder and prints them as "score|text", ordered by ascending score.
        /// </summary>
        private static void SimilarityTest()
        {
            List<string> examples = new List<string>() {
                "so i said, hey what are you doing?",
                "If you're wanting tickets for tomorrow night's home opener or any night at \"The Swamp\", call the 'Dogs front office at 910-426-5900",
                "with the steps required to download and use files from",
                "How bout me and u gets sum breakfast", // hmm, this is barely English though, isn't it?
                "Fashion Terms shoes online gr dresses by flirt john surratt . http://www.lsaco.com/fashion-terms-shoes-online-gr-dresses-by-flirt/",
                "@wvpv Do let me know how you get on... am Psonar CTO. Any problems or anything we can do to make it better, just say so! Thanks!",
                "@MissFAB_LC the circus.. Aunt Cina takin Juju 2day!!!!",
                "@1989515 난 작업견이라는 말까지는 안했어,,, 그냥 음큼견이라는 거지... ㅋㅋ 넌 순수해...  응큼한게 니 순수한 멋이야... ㅋㅋㅋㅋ @aphrodite_sung @jth800 #반말한당_",
                "装gentoo的时候可以看完一本小说。当然屁长屁长的还是看不完的",
                "RT ιƒ уσυ'яє ѕιηgℓє &hearts", // technically 'english', but spelled with look-alike glyphs
                "Dmn tuh? RT @vaniessadh: di apartemen sendokuran  so scary ternyata http://myloc.me/7sqYQ",
                "@QhaCaembie Hahaha... Tau de yg dah bs twitteran di HP. Bilang pa u sm w. Haha",
                "@BustinnJieeber haha :) LOVE U 2 &lt;3 ♥",
                "|@Ignorancelove WO MAI HL. LATER FAT. AND THE MILK SO ERM, EEEW." };
            LanguageModel english = ModelFactory.LoadModelFromFolder("data");

            // A SortedList keyed by similarity throws ArgumentException as soon as two
            // examples produce the same score, so collect the pairs and sort them instead.
            List<KeyValuePair<double, string>> classify = new List<KeyValuePair<double, string>>(examples.Count);
            foreach (string example in examples)
            {
                LanguageModel test = new LanguageModel(example);
                classify.Add(new KeyValuePair<double, string>(test.Similarity(english), example));
            }

            // Ascending by score; ties fall back to ordinal text order so the output is deterministic.
            classify.Sort((left, right) =>
            {
                int byScore = left.Key.CompareTo(right.Key);
                return byScore != 0 ? byScore : string.CompareOrdinal(left.Value, right.Value);
            });

            foreach (var example in classify)
            {
                Console.Out.WriteLine(example.Key + "|" + example.Value);
            }
        }
Пример #3
0
 /// <summary>
 /// Builds the shared English model by parsing the five sample text files under "data"
 /// as UTF-8, in order.
 /// </summary>
 public void Setup()
 {
     string[] corpusFiles =
     {
         "data/gutenburg_en_1.txt",
         "data/gutenburg_en_2.txt",
         "data/gutenburg_en_3.txt",
         "data/gutenburg_en_4.txt",
         "data/gutenburg_en_5.txt"
     };

     _english = new LanguageModel();
     foreach (string corpusFile in corpusFiles)
     {
         // Dispose each reader after parsing so the file handle is released promptly
         // instead of lingering until finalization.
         // NOTE(review): assumes ParseStream consumes the reader before returning - confirm.
         using (StreamReader reader = new StreamReader(corpusFile, Encoding.UTF8))
         {
             _english.ParseStream(reader);
         }
     }
 }
Пример #4
0
        /// <summary>
        /// The large English model compared with itself must score (approximately) 1,
        /// and a single tweet compared with it must not score above 1.
        /// </summary>
        public void LargeEnglishShouldHaveSimilarityOf1()
        {
            LanguageModel english_large = ModelFactory.LoadModelFromFolder("english_data"); // takes a long time to parse
            AssertPrettyMuchEqual(1.0, english_large.Similarity(english_large), "Expected identical large models to have similarity of 1");
            AssertPrettyMuchEqual(1.0, english_large.Similarity(english_large), "Expected identical large models to have similarity of 1, either way around");

            LanguageModel tweetModel = new LanguageModel("Aye #shoutout to all the single mothers out there doin what they gotta do to provide for their kids and themselves. u r appreciated :)");

            // LessOrEqual(arg1, arg2) asserts arg1 <= arg2, so the similarity goes first and
            // the upper bound second; the reversed order asserted similarity >= 1 instead.
            Assert.LessOrEqual(tweetModel.Similarity(english_large), 1.0d, "expected similarity to large model to be always less than 1");
        }
Пример #5
0
        /// <summary>
        /// Two models built from the same text must score a similarity of (approximately) 1,
        /// whichever of the two is the receiver of the call.
        /// </summary>
        public void EqualTextShouldBeOne()
        {
            const string sharedText = "once upon a time there was a small boy";
            LanguageModel left = new LanguageModel(sharedText);
            LanguageModel right = new LanguageModel(sharedText);

            double leftToRight = left.Similarity(right);
            AssertPrettyMuchEqual(1.0, leftToRight, "Expected identical strings to have similarity of 1");

            double rightToLeft = right.Similarity(left);
            AssertPrettyMuchEqual(1.0, rightToLeft, "Expected identical strings to have similarity of 1, either way around");
        }
Пример #6
0
        /// <summary>
        /// Two texts differing by a single character must score within the expected
        /// bounds: at least the lower threshold and never above 1.
        /// </summary>
        public void SimilarTextShouldCloseBeOne()
        {
            string txt1 = "once upon a time there was a small boy";
            string txt2 = "once upon a time there waz a small boy";
            LanguageModel model1 = new LanguageModel(txt1);
            LanguageModel model2 = new LanguageModel(txt2);

            // Score once and check both bounds against the same value.
            double similarity = model1.Similarity(model2);

            // NOTE(review): 0.09 is a very loose lower bound for "close to 1" - possibly
            // meant to be 0.9; confirm against real scores before tightening.
            Assert.GreaterOrEqual(similarity, 0.09d, "Expected very similar strings to have similarity close to 1");
            Assert.LessOrEqual(similarity, 1.0d, "Expected similarity of similar strings to never exceed 1");
        }
Пример #7
0
 /// <summary>
 /// Replaces the English model with the one loaded from the "data" folder and then
 /// runs EnglishThresholdTest against it.
 /// </summary>
 public void FullCourtEnglishTest()
 {
     LanguageModel folderModel = ModelFactory.LoadModelFromFolder("data");
     _english = folderModel;

     EnglishThresholdTest();
 }
Пример #8
0
 /// <summary>
 /// Prints the similarity of <paramref name="text"/> to the English model.
 /// The threshold assertion is currently disabled, so no assertion is performed.
 /// </summary>
 /// <param name="text">Text expected to be classified as non-English.</param>
 private void AssertNotEnglish(string text)
 {
     double score = new LanguageModel(text).Similarity(_english);
     string report = "Similarity:" + score + "\t Text:" + text;
     Console.Out.WriteLine(report);

     // Disabled check:
     // Assert.LessOrEqual(score, 0.05d, "Expected non english classification");
 }
Пример #9
0
 /// <summary>
 /// Prints the similarity of <paramref name="text"/> to the English model.
 /// The threshold assertion is currently disabled, so no assertion is performed.
 /// </summary>
 /// <param name="text">Text expected to be classified as English.</param>
 private void AssertEnglish(string text)
 {
     // FIXME: similarity should always be below 1, but sometimes it ends up above that.
     double score = new LanguageModel(text).Similarity(_english);
     string report = "Similarity:" + score + "\t Text:" + text;
     Console.Out.WriteLine(report);

     // Disabled check:
     // Assert.GreaterOrEqual(score, 0.05d, "Expected english classification");
 }
Пример #10
0
 /// <summary>
 /// Compute similarity between two trigram models: the sum, over every trigram present
 /// in both models, of the product of the two stored values, divided by the product of
 /// the two model lengths.
 /// For speed, call this on the smallest model and pass the largest one,
 /// e.g. twitterStatusModel.Similarity(englishModel): only this model's trigrams are
 /// enumerated, the other model is only probed by key.
 /// </summary>
 /// <param name="other">The model to compare against; must not be null.</param>
 /// <returns>
 /// The similarity score, or 0 when the product of the two lengths is zero.
 /// </returns>
 /// <exception cref="System.ArgumentNullException"><paramref name="other"/> is null.</exception>
 public double Similarity(LanguageModel other)
 {
     if (other == null)
     {
         throw new System.ArgumentNullException("other");
     }

     // Widen to double before multiplying so that integral lengths cannot overflow.
     // NOTE(review): the type of Length is not visible here; the cast is a no-op if it is already double.
     double denominator = (double)this.Length * (double)other.Length;
     if (denominator == 0)
     {
         // Nothing to compare against; avoid returning NaN from 0/0.
         return 0;
     }

     double dotProduct = 0;
     foreach (var first in _trigramModel)
     {
         if (!other._trigramModel.ContainsKey(first.Key)) continue;
         var otherFirst = other._trigramModel[first.Key];
         foreach (var second in first.Value)
         {
             if (!otherFirst.ContainsKey(second.Key)) continue;
             var otherSecond = otherFirst[second.Key];
             foreach (var third in second.Value)
             {
                 if (!otherSecond.ContainsKey(third.Key)) continue;
                 // Same widening as above: large integral counts would overflow if multiplied first.
                 dotProduct += (double)third.Value * (double)otherSecond[third.Key];
             }
         }
     }
     return dotProduct / denominator;
 }