/// <summary>
/// Entry point for the continuous clustering service. Loops forever:
/// processes batches of tweets into clusters, applies pending story
/// splits/merges, groups clusters into stories, and periodically cleans
/// and rehashes the LSH tables.
/// </summary>
static void Main(string[] args)
{
    const int IterationPauseMs = 30000;     // target cadence of one outer iteration
    const int MinBatchSizeToContinue = 10;  // keep draining while batches stay this large
    int batchesBeforeStoryProcessing = 5;   // max clustering batches per iteration

    Console.WriteLine(Name);

    TweetClusterWorker clusterWorker = new TweetClusterWorker();
    clusterWorker.CreateHashTables();
    Console.WriteLine("Created hash tables");
    clusterWorker.InitializeWithOldTweets();

    // NOTE(review): wall-clock scheduling (DateTime.Now) — a DST shift will
    // skew the rehash cadence; consider DateTime.UtcNow.
    DateTime nextRehash = DateTime.Now.AddHours(_rehashIntervalHours);

    while (true)
    {
        DateTime start = DateTime.Now;

        // Calculate tweet relations and TweetClusterIDs. Drain batches up to
        // the per-iteration limit, stopping early once a batch comes back small.
        int clusterBatchCount = 0;
        int processedCount = 0;
        do
        {
            processedCount = clusterWorker.ProcessTweetBatch();
            Console.WriteLine("Calculated relations");
        } while (processedCount > MinBatchSizeToContinue && clusterBatchCount++ < batchesBeforeStoryProcessing);

        StoryWorker.ApplyPendingStorySplits();
        StoryWorker.ApplyPendingStoryMerges();
        StoryWorker.Run();

        // Update hash tables: purge dead tweets and rotate the oldest hash
        // function once the rehash interval has elapsed.
        if (DateTime.Now > nextRehash)
        {
            Console.WriteLine("Cleaning deleted tweets from hashtables");
            clusterWorker.CleanDeletedOrArchivedTweets();
            Console.WriteLine("Rehashing");
            clusterWorker.UpdateOldestHashFunction();
            nextRehash = nextRehash.AddHours(_rehashIntervalHours);
            Console.WriteLine();
        }

        // Wait out the remainder of the cadence, but only when not backlogged
        // (i.e. the clustering batch limit was not exhausted this iteration).
        int runtime = (int)(DateTime.Now - start).TotalMilliseconds;
        if (clusterBatchCount < batchesBeforeStoryProcessing && runtime < IterationPauseMs)
        {
            Console.WriteLine("Waiting");
            Thread.Sleep(IterationPauseMs - runtime);
        }
    }
}
/// <summary>
/// Service entry point: loops forever, clustering incoming tweets, grouping
/// clusters into stories, and refreshing the hash tables on a fixed interval.
/// </summary>
static void Main(string[] args)
{
    Console.WriteLine(Name);

    TweetClusterWorker worker = new TweetClusterWorker();
    worker.CreateHashTables();
    Console.WriteLine("Created hash tables");
    worker.InitializeWithOldTweets();

    DateTime nextRehash = DateTime.Now.AddHours(_rehashIntervalHours);

    while (true)
    {
        DateTime iterationStart = DateTime.Now;

        // Calculate tweet relations and TweetClusterIDs, a bounded number of
        // batches per iteration, stopping early once a batch comes back small.
        int batchIndex = 0;
        while (true)
        {
            int processed = worker.ProcessTweetBatch();

            // These two are run here as they affect the perceived
            // responsiveness of the front-end.
            StoryWorker.ApplyPendingStorySplits();
            StoryWorker.ApplyPendingStoryMerges();

            if (processed <= 10 || batchIndex++ >= 10)
                break;
        }
        Console.WriteLine("Calculated relations");

        // Perform agglomerative grouping of clusters into stories.
        StoryWorker.Run();

        // Update hash tables once the rehash interval has elapsed.
        if (DateTime.Now > nextRehash)
        {
            Console.WriteLine("Cleaning deleted tweets from hashtables");
            worker.CleanDeletedTweets();
            Console.WriteLine("Rehashing");
            worker.UpdateOldestHashFunction();
            nextRehash = nextRehash.AddHours(_rehashIntervalHours);
            Console.WriteLine();
        }

        // Pad the iteration out to 30 seconds.
        int elapsedMs = (int)(DateTime.Now - iterationStart).TotalMilliseconds;
        if (elapsedMs < 30000)
        {
            Console.WriteLine("Waiting");
            Thread.Sleep(30000 - elapsedMs);
        }
    }
}
/// <summary>
/// Entry point for offline batch processing of an archived tweet interval:
/// reads tweets in ID order, maintains word statistics, inserts tweets and
/// word-tweet links, clusters into stories once warmed up, and performs
/// periodic word-score maintenance and hash-function rotation.
/// Optional args: [0] = start tweet ID, [1] = stop tweet ID.
/// </summary>
static void Main(string[] args)
{
    Test();
    try
    {
        Output.Print(_name, "Starting batch processing...");

        // Interval defaults.
        long startAtID = 143479600000000000;
        long stopAtID = 149276747383320578; // stop at the first tweet of the 2011-12-21 file
        if (args != null && args.Length == 2)
        {
            // FIX: only override the defaults when the argument actually parses;
            // the old code let a failed TryParse clobber them with 0.
            if (long.TryParse(args[0], out long parsedStart)) startAtID = parsedStart;
            if (long.TryParse(args[1], out long parsedStop)) stopAtID = parsedStop;
        }

        // Initialize.
        long lastProcessedID = startAtID;
        long processedCount = 0;
        DateTime lastRehashTime = new DateTime();
        DateTime lastMaintenanceTime = new DateTime();
        bool initializedNeighborFinder = false;
        InitializeRandomStreamFileNames();
        TweetClusterWorker neighborFinder = new TweetClusterWorker();

        // Process tweets.
        while (lastProcessedID < stopAtID)
        {
            // Get batch of tweets + corresponding batch of random stream words.
            Dictionary<long, Tweet> tweets = GetTweetBatch(lastProcessedID, 400); // Was 200
            if (tweets.Count == 0)
            {
                // FIX: an empty batch previously fell through to tweets.Keys.Max(),
                // which throws on an empty collection; stop cleanly instead.
                break;
            }
            Console.WriteLine("Processing " + tweets.Count + " tweets");
            processedCount += tweets.Count;

            // Extract words from tweets.
            Dictionary<string, int> wordCounts = new Dictionary<string, int>();
            foreach (Tweet t in tweets.Values)
            {
                foreach (string word in t.Words)
                {
                    // Single-lookup upsert (was ContainsKey + indexer).
                    wordCounts.TryGetValue(word, out int count);
                    wordCounts[word] = count + 1;
                }
            }
            int sampleStreamTweetCount = AddSampleStreamWords(wordCounts, tweets.Keys.Max(), 2000); // Was 2000
            Console.WriteLine("Calculated word counts (" + sampleStreamTweetCount + " tweets from sample stream)");

            // Update DB word stats.
            Dictionary<string, Word> words = InsertWords(wordCounts);
            Console.WriteLine("Inserted " + words.Count + " words");

            // Insert into Tweet and WordTweet.
            InsertToTweetAndWordTweet(tweets, words);
            Console.WriteLine("Inserted tweets");

            DateTime lastTweetTime = tweets.Values.Max(n => n.CreatedAt);

            // Start clustering only after a warm-up volume of tweets.
            if (processedCount > 2000)
            {
                if (!initializedNeighborFinder)
                {
                    neighborFinder.CreateHashTables();
                    neighborFinder.InitializeWithOldTweets();
                    initializedNeighborFinder = true;
                }

                // Calculate tweet relations and TweetClusterIDs.
                while (neighborFinder.ProcessTweetBatch() > 0) ;
                Console.WriteLine("Calculated relations");

                // Perform agglomerative grouping of clusters into stories.
                StoryWorker.Run();
            }

            // Perform maintenance of word stats every 10 minutes of tweet time.
            // FIX: use TotalMinutes — TimeSpan.Minutes is only the 0-59 component,
            // so e.g. an exact 2-hour gap reported 0 and maintenance never fired.
            if ((lastTweetTime - lastMaintenanceTime).TotalMinutes > 10)
            {
                Helpers.RunSqlStatement(_name, "update WordScore set Score1h = Score1h * 0.890899, Score4d = Score4d * 0.99879734;");
                Helpers.RunSqlStatement(_name, "delete wt.* from WordScore ws, WordTweet as wt where ws.WordID = wt.WordID and (score4d < 50 and score1h < 0.5);");
                Helpers.RunSqlStatement(_name, "delete w.*, ws.* from WordScore ws, Word as w where ws.WordID = w.WordID and (score4d < 50 and score1h < 0.5);");
                Helpers.RunSqlStatement(_name, "update Constants set value = (select 0.1 * max(score4d) from WordScore) where name = 'WordScore4dHigh';");
                Helpers.RunSqlStatement(_name, @"delete r.* from TweetRelation r natural join ( select TweetID1, TweetID2, tc1.IsArchived or tc2.IsArchived as IsArchived from TweetRelation r join Tweet t1 on t1.TweetID=r.TweetID1 join TweetCluster tc1 on tc1.TweetClusterID=t1.TweetClusterID join Tweet t2 on t2.TweetID=r.TweetID2 join TweetCluster tc2 on tc2.TweetClusterID=t2.TweetClusterID ) T where IsArchived;");
                lastMaintenanceTime = lastTweetTime;
                Console.WriteLine("Performed maintenance");
            }

            // Update hashtables every 4 hours of tweet time.
            // FIX: use TotalHours — TimeSpan.Hours is only the 0-23 component.
            if (initializedNeighborFinder && (lastTweetTime - lastRehashTime).TotalHours > 4)
            {
                neighborFinder.UpdateOldestHashFunction();
                lastRehashTime = lastTweetTime;
                Console.WriteLine("Recalculated a hash function");
            }

            lastProcessedID = tweets.Keys.Max();
        }

        // Clean up.
        if (_sampleStreamReader != null)
        {
            _sampleStreamReader.Close();
            _sampleStreamReader.Dispose();
        }
        Output.Print(_name, "Finished batch processing.");
    }
    catch (Exception e)
    {
        Output.Print(_name, e);
    }
}