Ejemplo n.º 1
0
        /// <summary>
        /// Walks every tweet whose <c>sample_reason</c> equals <c>SampleReason.user_data</c>,
        /// tokenizes its text, registers each token for which <c>Word.IsEntity()</c> returns
        /// true on the tweet via <c>AddEntity</c>, and pushes the tweet to a
        /// <c>PushToCouchDBReceiver</c>.
        /// </summary>
        /// <remarks>
        /// A progress line is written to the console after every 1000 tweets.
        /// </remarks>
        public void Execute()
        {
            PushToCouchDBReceiver couchReceiver = new PushToCouchDBReceiver();
            int processed = 0;

            using (XtractDataContext context = new XtractDataContext())
            {
                // Object tracking is switched off before the query is enumerated.
                context.ObjectTrackingEnabled = false;

                var userDataTweets = context.Tweets.Where(
                    candidate => candidate.sample_reason.Equals(SampleReason.user_data.ToString()));

                foreach (Tweet current in userDataTweets)
                {
                    // Captured up front so every Word of this tweet carries the same identity values.
                    string author = current.screen_name;
                    long? tweetId = current.twitter_id;

                    foreach (string token in _tokenizer.Tokenize(current.text))
                    {
                        Word candidateWord = new Word
                        {
                            screen_name = author,
                            text = token,
                            twitter_id = tweetId
                        };

                        if (!candidateWord.IsEntity())
                        {
                            continue;
                        }

                        current.AddEntity(candidateWord.text);
                    }

                    couchReceiver.Push(current);

                    processed++;
                    if (processed % 1000 == 0)
                    {
                        Console.Out.WriteLine("tokenized " + processed + " tweets");
                    }
                }
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads the reference language model from the <c>english_data</c> folder, then for
        /// every tweet whose <c>sample_reason</c> equals <c>SampleReason.user_data</c> builds
        /// a model from the tweet text, measures its similarity against the reference model
        /// and passes the result to <c>SetTweetEnglishSimilarity</c> on the data context.
        /// </summary>
        /// <remarks>
        /// Every 100 tweets a sample line and a running total are printed; the total is
        /// printed once more after the loop finishes.
        /// </remarks>
        public void Execute()
        {
            Console.Out.WriteLine("Parsing english model trigrams from source data in 'english_data' directory");
            LanguageModel referenceModel = ModelFactory.LoadModelFromFolder("english_data");
            Console.Out.WriteLine("Done parsing english model trigrams");

            int updated = 0;
            using (XtractDataContext context = new XtractDataContext())
            {
                context.ObjectTrackingEnabled = false;

                var userDataTweets = context.Tweets.Where(
                    candidate => candidate.sample_reason.Equals(SampleReason.user_data.ToString()));

                foreach (Tweet current in userDataTweets)
                {
                    LanguageModel tweetModel = new LanguageModel(current.text);
                    double score = tweetModel.Similarity(referenceModel);
                    Debug.Assert(score <= 1.0d, "Similarity should never be more than 1");
                    context.SetTweetEnglishSimilarity(current.twitter_id, score);

                    updated++;
                    if (updated % 100 != 0)
                    {
                        continue;
                    }

                    // Periodic progress: one sample row followed by the running total.
                    Console.Out.WriteLine("{0}:{1}:'{2}'", current.twitter_id, score, current.text);
                    Console.Out.WriteLine("Updated similarity for " + updated + " tweets");
                }

                Console.Out.WriteLine("Updated similarity for " + updated + " tweets");
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads statuses from the Twitter sample stream (filtered through an
        /// <c>EnglishStatusProvider</c>), upserts the author into <c>Twusers</c> and stores
        /// each status as a <c>Tweet</c> tagged <c>SampleReason.sample_stream</c>.
        /// </summary>
        /// <remarks>
        /// Credentials are read from the <c>twitter_user</c> / <c>twitter_pass</c> app settings.
        /// Changes are submitted once per status. A progress line is written every
        /// <c>UPDATE_EVERY</c> stored tweets.
        /// </remarks>
        public void Execute()
        {
            int count = 0;

            WebResponseBuilder responseBuilder = new WebResponseBuilder();
            string twitter_api_username = ConfigurationManager.AppSettings["twitter_user"];
            string twitter_api_password = ConfigurationManager.AppSettings["twitter_pass"];
            responseBuilder.UseCGICredentials(twitter_api_username, twitter_api_password);

            TwitterStreamStatusProvider provider = new TwitterStreamStatusProvider(responseBuilder);
            provider.YieldThisMany = NUM_STATUSES_TO_PULL;

            Console.Out.WriteLine("Parsing english model trigrams from source data in 'english_data' directory");
            EnglishStatusProvider englishProvider = new EnglishStatusProvider(provider, "english_data");
            englishProvider.Threshold = ENGLISH_THRESHOLD;
            Console.Out.WriteLine("About to start reading from twitter - up to " + NUM_STATUSES_TO_PULL + " statuses.");

            // The context is disposed deterministically, even if the stream or a submit throws.
            using (XtractDataContext db = new XtractDataContext())
            {
                foreach (TwitterStatus status in englishProvider.GetMessages())
                {
                    string screen_name = status.user.screen_name;

                    // Single lookup instead of a Count() query followed by a First() query.
                    Twuser user = db.Twusers.FirstOrDefault(u => u.screen_name == screen_name);
                    if (user == null)
                    {
                        user = Twuser.From(status.user);
                        user.english_similarity = status.english_similarity;
                        db.Twusers.InsertOnSubmit(user);
                    }
                    else if (user.english_similarity < status.english_similarity)
                    {
                        // Keep the highest similarity seen so far for this user.
                        user.english_similarity = status.english_similarity;
                    }

                    Tweet tweet = new Tweet();
                    tweet.english_similarity = status.english_similarity;
                    tweet.screen_name = status.user.screen_name;
                    tweet.text = status.text;
                    tweet.twitter_id = status.id;
                    tweet.sample_reason = SampleReason.sample_stream.ToString();
                    db.Tweets.InsertOnSubmit(tweet);
                    db.SubmitChanges();

                    // Report once per UPDATE_EVERY tweets; the previous "count++ > UPDATE_EVERY"
                    // test printed on every tweet after the threshold had been passed.
                    count++;
                    if (UPDATE_EVERY > 0 && count % UPDATE_EVERY == 0)
                    {
                        Console.Out.WriteLine("Wrote " + count + " tweets.");
                    }
                }
            }
        }
Ejemplo n.º 4
0
    /// <summary>
    /// Reads up to ten tweets whose <c>sample_reason</c> equals
    /// <c>SampleReason.user_data</c> and prints the output of <c>JSON.Serialize</c> for
    /// each one to the console.
    /// </summary>
    public void TestTweetSerializeFromDB()
    {
        // Setup: these objects are constructed but not used by the rest of the method.
        IUrlExpander urlExpander = new UrlExpander();
        Tokenizer wordTokenizer = new Tokenizer(urlExpander);

        using (XtractDataContext context = new XtractDataContext())
        {
            context.ObjectTrackingEnabled = false;

            var firstTen = context.Tweets
                .Where(candidate => candidate.sample_reason.Equals(SampleReason.user_data.ToString()))
                .Take(10);

            foreach (Tweet current in firstTen)
            {
                string serialized = JSON.Serialize(current);
                Console.Out.WriteLine("JSON:" + serialized);
            }
        }
    }
Ejemplo n.º 5
0
        /// <summary>
        /// Checks whether more than five tweets are already stored for the given screen name.
        /// If so, the matching <c>Twuser</c> row is marked <c>DownloadStatus.DataDownloaded</c>
        /// and the change is submitted.
        /// </summary>
        /// <param name="screen_name">Screen name to look up in both <c>Tweets</c> and <c>Twusers</c>.</param>
        /// <returns>
        /// <c>true</c> when more than five tweets exist (the caller can skip the retrieve);
        /// otherwise <c>false</c>.
        /// </returns>
        private bool CheckForExisting(string screen_name)
        {
            using (XtractDataContext context = new XtractDataContext())
            {
                var storedTweets = context.Tweets
                    .Where(stored => stored.screen_name.Equals(screen_name))
                    .Count();

                if (storedTweets <= 5)
                {
                    return false;
                }

                Console.Out.WriteLine("Skipping retrieve for  @" + screen_name + " as we already have " + storedTweets + " rows of data");

                // First() throws if no Twuser row matches the screen name.
                var owner = context.Twusers
                    .Where(candidate => candidate.screen_name.Equals(screen_name))
                    .First();

                owner.last_parse_status = DownloadStatus.DataDownloaded.ToString();
                context.SubmitChanges();
                return true;
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// For every tenth user (<c>id % 10 == 0</c>) that has no <c>last_parse_status</c>,
        /// ordered by descending English similarity, downloads the user's statuses through a
        /// <c>UserStatusProvider</c>, stores the ones not yet present in <c>Tweets</c> with
        /// <c>SampleReason.user_data</c>, and records the outcome on the user row.
        /// </summary>
        /// <remarks>
        /// Users for which <c>CheckForExisting</c> returns true are skipped. Inserts are
        /// submitted in batches whenever the overall count reaches a multiple of 100, and any
        /// remainder is submitted once the user's statuses have all been read, so no pending
        /// insert is dropped when the write context is disposed.
        /// </remarks>
        public void Execute()
        {
            int overallCount = 0;
            using (XtractDataContext dbRead = new XtractDataContext())
            {
                dbRead.ObjectTrackingEnabled = false;
                foreach (Twuser user in from twuser in dbRead.Twusers
                                        where ((twuser.id % 10 == 0) &&
                                               (twuser.last_parse_status == null))
                                        orderby twuser.english_similarity descending
                                        select twuser)
                {
                    string screen_name = user.screen_name;
                    if (CheckForExisting(screen_name))
                    {
                        continue;
                    }

                    OAuthTwitterResponseBuilder oAuthTwitter = new OAuthTwitterResponseBuilder();
                    UserStatusProvider provider = new UserStatusProvider(oAuthTwitter, screen_name);
                    Console.Out.WriteLine("About to request data for @" + screen_name);

                    // One scan timestamp is shared by every tweet stored for this user.
                    DateTime nowish = DateTime.UtcNow;
                    int count = 0;
                    using (XtractDataContext dbWrite = new XtractDataContext())
                    {
                        foreach (TwitterStatus status in provider.GetMessages())
                        {
                            long twitter_id = status.id.Value;

                            // Skip statuses that are already stored.
                            if (dbWrite.Tweets.Any(tw => tw.twitter_id == twitter_id))
                            {
                                continue;
                            }

                            Tweet tweet = new Tweet();
                            tweet.screen_name = status.user.screen_name;
                            tweet.text = status.text;
                            tweet.twitter_id = status.id;
                            DateTime createdAt = DateUtils.UTCDateTimeFromTwitterTimeStamp(status.created_at);
                            tweet.date_tweeted = DateUtils.ISO8601TimeStampFromUTCDateTime(createdAt);
                            tweet.date_scanned = DateUtils.ISO8601TimeStampFromUTCDateTime(nowish);
                            tweet.sample_reason = SampleReason.user_data.ToString();
                            dbWrite.Tweets.InsertOnSubmit(tweet);
                            count++;
                            overallCount++;
                            if (overallCount % 100 == 0)
                            {
                                dbWrite.SubmitChanges();
                                Console.Out.WriteLine(overallCount + " tweets saved so far");
                            }
                        }

                        // Flush inserts queued since the last batch boundary. Without this,
                        // they were discarded when dbWrite was disposed, while the user was
                        // still marked as downloaded below.
                        dbWrite.SubmitChanges();
                    }

                    Console.Out.WriteLine(count + " tweets found for " + screen_name);
                    using (XtractDataContext dbWrite = new XtractDataContext())
                    {
                        var lastUser = (from twuser in dbWrite.Twusers
                                        where twuser.screen_name.Equals(screen_name)
                                        select twuser).First();

                        DownloadStatus lastStatus = (count == 0) ? DownloadStatus.NoDataFound : DownloadStatus.DataDownloaded;
                        lastUser.last_parse_status = lastStatus.ToString();
                        dbWrite.SubmitChanges();
                    }
                }
            }
        }