/// <summary>
/// Iterates over every tweet whose <c>sample_reason</c> equals <c>user_data</c>,
/// tokenizes its text with <c>_tokenizer</c>, adds each token that reports itself
/// as an entity to the tweet, and pushes the tweet to a
/// <see cref="PushToCouchDBReceiver"/>. Progress is logged every 1000 tweets.
/// </summary>
public void Execute()
{
    int processed = 0;
    PushToCouchDBReceiver couchReceiver = new PushToCouchDBReceiver();

    using (XtractDataContext context = new XtractDataContext())
    {
        // Rows are only read through this context, so tracking is switched off.
        context.ObjectTrackingEnabled = false;

        var userDataTweets = context.Tweets
            .Where(tweet => tweet.sample_reason.Equals(SampleReason.user_data.ToString()));

        foreach (Tweet current in userDataTweets)
        {
            // Captured once per tweet, before any entity is added to it.
            string author = current.screen_name;
            long? tweetId = current.twitter_id;

            foreach (string token in _tokenizer.Tokenize(current.text))
            {
                Word candidate = new Word
                {
                    screen_name = author,
                    text = token,
                    twitter_id = tweetId
                };

                if (!candidate.IsEntity())
                {
                    continue;
                }

                current.AddEntity(candidate.text);
            }

            couchReceiver.Push(current);

            processed++;
            if (processed % 1000 == 0)
            {
                Console.Out.WriteLine("tokenized {0} tweets", processed);
            }
        }
    }
}
/// <summary>
/// Loads the English language model from the 'english_data' folder, then for every
/// tweet whose <c>sample_reason</c> equals <c>user_data</c> builds a model from the
/// tweet text, computes its similarity to the English model and hands the value to
/// <c>SetTweetEnglishSimilarity</c>. Progress is logged every 100 tweets and once
/// more when the loop finishes.
/// </summary>
public void Execute()
{
    Console.Out.WriteLine("Parsing english model trigrams from source data in 'english_data' directory");
    LanguageModel englishModel = ModelFactory.LoadModelFromFolder("english_data");
    Console.Out.WriteLine("Done parsing english model trigrams");

    int updated = 0;

    using (XtractDataContext context = new XtractDataContext())
    {
        context.ObjectTrackingEnabled = false;

        var userDataTweets = context.Tweets
            .Where(tweet => tweet.sample_reason.Equals(SampleReason.user_data.ToString()));

        foreach (Tweet current in userDataTweets)
        {
            LanguageModel tweetModel = new LanguageModel(current.text);
            double score = tweetModel.Similarity(englishModel);
            Debug.Assert(score <= 1.0d, "Similarity should never be more than 1");

            context.SetTweetEnglishSimilarity(current.twitter_id, score);

            updated++;
            if (updated % 100 != 0)
            {
                continue;
            }

            // Periodic sample of the latest result followed by the running total.
            Console.Out.WriteLine("{0}:{1}:'{2}'", current.twitter_id, score, current.text);
            Console.Out.WriteLine("Updated similarity for {0} tweets", updated);
        }

        Console.Out.WriteLine("Updated similarity for {0} tweets", updated);
    }
}
/// <summary>
/// Reads up to <c>NUM_STATUSES_TO_PULL</c> statuses from the Twitter stream through an
/// <see cref="EnglishStatusProvider"/> (threshold <c>ENGLISH_THRESHOLD</c>) and stores
/// each one as a <see cref="Tweet"/> with sample reason <c>sample_stream</c>.
/// </summary>
/// <remarks>
/// For every status the author's <see cref="Twuser"/> row is inserted if no row with
/// that screen name exists; otherwise its <c>english_similarity</c> is raised when the
/// new status scores higher. Changes are submitted after every status.
/// Credentials are read from the "twitter_user" and "twitter_pass" app settings.
/// </remarks>
public void Execute()
{
    int count = 0;

    WebResponseBuilder responseBuilder = new WebResponseBuilder();
    string twitter_api_username = ConfigurationManager.AppSettings["twitter_user"];
    string twitter_api_password = ConfigurationManager.AppSettings["twitter_pass"];
    responseBuilder.UseCGICredentials(twitter_api_username, twitter_api_password);

    TwitterStreamStatusProvider provider = new TwitterStreamStatusProvider(responseBuilder);
    provider.YieldThisMany = NUM_STATUSES_TO_PULL;

    Console.Out.WriteLine("Parsing english model trigrams from source data in 'english_data' directory");
    EnglishStatusProvider englishProvider = new EnglishStatusProvider(provider, "english_data");
    englishProvider.Threshold = ENGLISH_THRESHOLD;

    Console.Out.WriteLine("About to start reading from twitter - up to " + NUM_STATUSES_TO_PULL + " statuses.");

    // The context is disposed when the stream ends or an exception escapes;
    // previously it was never disposed.
    using (XtractDataContext db = new XtractDataContext())
    {
        foreach (TwitterStatus status in englishProvider.GetMessages())
        {
            string screen_name = status.user.screen_name;

            // Single lookup instead of Count() followed by First().
            Twuser user = db.Twusers.FirstOrDefault(u => u.screen_name == screen_name);
            if (user == null)
            {
                user = Twuser.From(status.user);
                user.english_similarity = status.english_similarity;
                db.Twusers.InsertOnSubmit(user);
            }
            else if (user.english_similarity < status.english_similarity)
            {
                // Keep the highest similarity seen so far for this user.
                user.english_similarity = status.english_similarity;
            }

            Tweet tweet = new Tweet();
            tweet.english_similarity = status.english_similarity;
            tweet.screen_name = status.user.screen_name;
            tweet.text = status.text;
            tweet.twitter_id = status.id;
            tweet.sample_reason = SampleReason.sample_stream.ToString();
            db.Tweets.InsertOnSubmit(tweet);

            db.SubmitChanges();

            // Log once every UPDATE_EVERY tweets. The previous test
            // (count++ > UPDATE_EVERY) logged on every tweet after the threshold.
            if (++count % UPDATE_EVERY == 0)
            {
                Console.Out.WriteLine("Wrote " + count + " tweets.");
            }
        }
    }
}
/// <summary>
/// Reads at most ten tweets whose <c>sample_reason</c> equals <c>user_data</c> and
/// writes the result of <c>JSON.Serialize</c> for each one to the console.
/// </summary>
public void TestTweetSerializeFromDB()
{
    // setup
    // The expander and tokenizer are constructed here but not used by the loop below.
    IUrlExpander urlExpander = new UrlExpander();
    Tokenizer tweetTokenizer = new Tokenizer(urlExpander);

    using (XtractDataContext context = new XtractDataContext())
    {
        context.ObjectTrackingEnabled = false;

        var firstTen = context.Tweets
            .Where(tweet => tweet.sample_reason.Equals(SampleReason.user_data.ToString()))
            .Take(10);

        foreach (Tweet current in firstTen)
        {
            string json = JSON.Serialize(current);
            Console.Out.WriteLine("JSON:" + json);
        }
    }
}
/// <summary>
/// Checks whether more than five tweets are already stored for a screen name and,
/// if so, marks the matching user as <c>DataDownloaded</c>.
/// </summary>
/// <param name="screen_name">Screen name to look up in the tweets and users tables.</param>
/// <returns>
/// <c>true</c> when more than five tweets exist and the user's
/// <c>last_parse_status</c> was updated; otherwise <c>false</c>.
/// </returns>
private bool CheckForExisting(string screen_name)
{
    using (XtractDataContext dbWrite = new XtractDataContext())
    {
        int storedTweets = dbWrite.Tweets
            .Where(tweet => tweet.screen_name.Equals(screen_name))
            .Count();

        // Five or fewer rows: the caller should go ahead and download.
        if (storedTweets <= 5)
        {
            return false;
        }

        Console.Out.WriteLine(
            "Skipping retrieve for @{0} as we already have {1} rows of data",
            screen_name,
            storedTweets);

        // First() throws if no user row carries this screen name.
        var owner = dbWrite.Twusers
            .Where(twuser => twuser.screen_name.Equals(screen_name))
            .First();

        owner.last_parse_status = DownloadStatus.DataDownloaded.ToString();
        dbWrite.SubmitChanges();
        return true;
    }
}
/// <summary>
/// Downloads tweets for a sample of users that have not been parsed yet and records
/// the outcome on each user row.
/// </summary>
/// <remarks>
/// Users are selected where <c>id % 10 == 0</c> and <c>last_parse_status</c> is null,
/// ordered by <c>english_similarity</c> descending. A user for whom
/// <c>CheckForExisting</c> returns <c>true</c> is skipped. For the others, every status
/// returned by <see cref="UserStatusProvider"/> whose id is not already in the tweets
/// table is inserted with sample reason <c>user_data</c>. The user's
/// <c>last_parse_status</c> is then set to <c>NoDataFound</c> or <c>DataDownloaded</c>
/// depending on whether any tweet was inserted.
/// </remarks>
public void Execute()
{
    int overallCount = 0;

    using (XtractDataContext dbRead = new XtractDataContext())
    {
        dbRead.ObjectTrackingEnabled = false;

        var pendingUsers = from twuser in dbRead.Twusers
                           where ((twuser.id % 10 == 0) && (twuser.last_parse_status == null))
                           orderby twuser.english_similarity descending
                           select twuser;

        foreach (Twuser user in pendingUsers)
        {
            string screen_name = user.screen_name;
            if (CheckForExisting(screen_name))
            {
                continue;
            }

            OAuthTwitterResponseBuilder oAuthTwitter = new OAuthTwitterResponseBuilder();
            UserStatusProvider provider = new UserStatusProvider(oAuthTwitter, user.screen_name);
            Console.Out.WriteLine("About to request data for @" + user.screen_name);

            // One scan timestamp is shared by every tweet fetched for this user.
            DateTime nowish = DateTime.UtcNow;
            int count = 0;

            using (XtractDataContext dbWrite = new XtractDataContext())
            {
                foreach (TwitterStatus status in provider.GetMessages())
                {
                    long twitter_id = status.id.Value;

                    // Skip tweets already stored. This checks the database only,
                    // not inserts still pending in the current batch.
                    if (dbWrite.Tweets.Any(tw => tw.twitter_id == twitter_id))
                    {
                        continue;
                    }

                    // english_similarity is not populated on this path.
                    Tweet tweet = new Tweet();
                    tweet.screen_name = status.user.screen_name;
                    tweet.text = status.text;
                    tweet.twitter_id = status.id;
                    DateTime createdAt = DateUtils.UTCDateTimeFromTwitterTimeStamp(status.created_at);
                    tweet.date_tweeted = DateUtils.ISO8601TimeStampFromUTCDateTime(createdAt);
                    tweet.date_scanned = DateUtils.ISO8601TimeStampFromUTCDateTime(nowish);
                    tweet.sample_reason = SampleReason.user_data.ToString();
                    dbWrite.Tweets.InsertOnSubmit(tweet);

                    count++;
                    overallCount++;

                    // Batch the writes: submit whenever the running total hits a multiple of 100.
                    if (overallCount % 100 == 0)
                    {
                        dbWrite.SubmitChanges();
                        Console.Out.WriteLine(overallCount + " tweets saved so far");
                    }
                }

                // Flush inserts left over from the last partial batch. Without this they
                // are discarded when the context is disposed, although the user is still
                // marked as downloaded below.
                dbWrite.SubmitChanges();
            }

            Console.Out.WriteLine(count + " tweets found for " + screen_name);

            using (XtractDataContext dbWrite = new XtractDataContext())
            {
                var lastUser = (from twuser in dbWrite.Twusers
                                where twuser.screen_name.Equals(screen_name)
                                select twuser).First();

                DownloadStatus lastStatus = (count == 0)
                    ? DownloadStatus.NoDataFound
                    : DownloadStatus.DataDownloaded;

                lastUser.last_parse_status = lastStatus.ToString();
                dbWrite.SubmitChanges();
            }
        }
    }
}