Пример #1
0
        /// <summary>
        /// Entry point of the clustering worker. Creates the hash tables, calls
        /// <c>InitializeWithOldTweets</c>, then loops forever: processes tweet batches,
        /// applies pending story splits and merges, runs the story worker and, once per
        /// rehash interval, cleans and rehashes the hash tables.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Minimum length of one cycle (in milliseconds) when there is no backlog.
            const int cycleLengthMs = 30000;
            const int batchesBeforeStoryProcessing = 5;

            Console.WriteLine(Name);

            TweetClusterWorker clusterWorker = new TweetClusterWorker();

            clusterWorker.CreateHashTables();
            Console.WriteLine("Created hash tables");
            clusterWorker.InitializeWithOldTweets();

            // Scheduled in UTC so a daylight-saving shift of the local clock cannot move
            // the rehash deadline by an hour.
            DateTime nextRehash = DateTime.UtcNow.AddHours(_rehashIntervalHours);

            while (true)
            {
                // A Stopwatch measures elapsed time without depending on the wall clock.
                // With DateTime.Now a clock set-back produced a negative runtime and
                // therefore a sleep far longer than one cycle.
                System.Diagnostics.Stopwatch cycleTimer = System.Diagnostics.Stopwatch.StartNew();

                //Calculate tweet relations and TweetClusterIDs
                // Note: because of the post-increment, up to batchesBeforeStoryProcessing + 1
                // batches are processed per cycle.
                int clusterBatchCount = 0;
                int processedCount;
                do
                {
                    processedCount = clusterWorker.ProcessTweetBatch();
                    Console.WriteLine("Calculated relations");
                } while (processedCount > 10 && clusterBatchCount++ < batchesBeforeStoryProcessing);

                StoryWorker.ApplyPendingStorySplits();
                StoryWorker.ApplyPendingStoryMerges();
                StoryWorker.Run();

                //Update hash tables
                if (DateTime.UtcNow > nextRehash)
                {
                    Console.WriteLine("Cleaning deleted tweets from hashtables");
                    clusterWorker.CleanDeletedOrArchivedTweets();
                    Console.WriteLine("Rehashing");
                    clusterWorker.UpdateOldestHashFunction();
                    nextRehash = nextRehash.AddHours(_rehashIntervalHours);
                    Console.WriteLine();
                }

                //Wait out the rest of the cycle, unless the batch limit was hit (backlog remains)
                long elapsedMs = cycleTimer.ElapsedMilliseconds;
                if (clusterBatchCount < batchesBeforeStoryProcessing && elapsedMs < cycleLengthMs)
                {
                    Console.WriteLine("Waiting");
                    Thread.Sleep((int)(cycleLengthMs - elapsedMs));
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Entry point of the clustering worker. Creates the hash tables, calls
        /// <c>InitializeWithOldTweets</c>, then runs an endless cycle of tweet batch
        /// processing, story split/merge application, story processing and a periodic
        /// clean-up and rehash of the hash tables.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Minimum length of one cycle (in milliseconds) when there is no backlog.
            const int cycleLengthMs = 30000;
            const int batchesBeforeStoryProcessing = 5;

            Console.WriteLine(Name);

            TweetClusterWorker clusterWorker = new TweetClusterWorker();

            clusterWorker.CreateHashTables();
            Console.WriteLine("Created hash tables");
            clusterWorker.InitializeWithOldTweets();

            // The deadline is kept in UTC: local time jumps by an hour at daylight-saving
            // transitions, which would shift the rehash schedule.
            DateTime nextRehash = DateTime.UtcNow.AddHours(_rehashIntervalHours);

            while (true)
            {
                // Elapsed time is taken from a Stopwatch rather than from DateTime.Now
                // differences, so a wall-clock adjustment can neither produce a negative
                // runtime nor an over-long sleep.
                System.Diagnostics.Stopwatch cycleTimer = System.Diagnostics.Stopwatch.StartNew();

                //Calculate tweet relations and TweetClusterIDs
                // The loop stops when a batch handled 10 tweets or fewer, or when the batch
                // limit is exceeded (the counter is post-incremented, so the limit allows
                // batchesBeforeStoryProcessing + 1 batches).
                int clusterBatchCount = 0;
                int processedCount;
                do
                {
                    processedCount = clusterWorker.ProcessTweetBatch();
                    Console.WriteLine("Calculated relations");
                } while (processedCount > 10 && clusterBatchCount++ < batchesBeforeStoryProcessing);

                StoryWorker.ApplyPendingStorySplits();
                StoryWorker.ApplyPendingStoryMerges();
                StoryWorker.Run();

                //Update hash tables
                if (DateTime.UtcNow > nextRehash)
                {
                    Console.WriteLine("Cleaning deleted tweets from hashtables");
                    clusterWorker.CleanDeletedOrArchivedTweets();
                    Console.WriteLine("Rehashing");
                    clusterWorker.UpdateOldestHashFunction();
                    nextRehash = nextRehash.AddHours(_rehashIntervalHours);
                    Console.WriteLine();
                }

                //Wait for the remainder of the cycle; skipped when the batch limit was hit
                long elapsedMs = cycleTimer.ElapsedMilliseconds;
                bool hitBatchLimit = clusterBatchCount >= batchesBeforeStoryProcessing;
                if (!hitBatchLimit && elapsedMs < cycleLengthMs)
                {
                    Console.WriteLine("Waiting");
                    Thread.Sleep((int)(cycleLengthMs - elapsedMs));
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Entry point of the clustering worker. Creates the hash tables, calls
        /// <c>InitializeWithOldTweets</c>, then loops forever: processes tweet batches
        /// (applying pending story splits and merges after each batch), runs the story
        /// worker and, once per rehash interval, cleans and rehashes the hash tables.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Minimum length of one cycle, in milliseconds.
            const int cycleLengthMs = 30000;

            Console.WriteLine(Name);

            TweetClusterWorker neighborFinder = new TweetClusterWorker();

            neighborFinder.CreateHashTables();
            Console.WriteLine("Created hash tables");
            neighborFinder.InitializeWithOldTweets();

            // Scheduled in UTC so a daylight-saving shift of the local clock cannot move
            // the rehash deadline by an hour.
            DateTime nextRehash = DateTime.UtcNow.AddHours(_rehashIntervalHours);

            while (true)
            {
                // A Stopwatch measures elapsed time without depending on the wall clock.
                // With DateTime.Now a clock set-back produced a negative runtime and
                // therefore a sleep far longer than one cycle.
                System.Diagnostics.Stopwatch cycleTimer = System.Diagnostics.Stopwatch.StartNew();

                //Calculate tweet relations and TweetClusterIDs
                int clusterBatchCount = 0;
                int processedCount;
                do
                {
                    processedCount = neighborFinder.ProcessTweetBatch();

                    //These two are run here as they affect the perceived responsiveness of the front-end
                    StoryWorker.ApplyPendingStorySplits();
                    StoryWorker.ApplyPendingStoryMerges();
                } while (processedCount > 10 && clusterBatchCount++ < 10);
                Console.WriteLine("Calculated relations");

                //Perform agglomerative grouping of clusters into stories
                StoryWorker.Run();

                //Update hash tables
                if (DateTime.UtcNow > nextRehash)
                {
                    Console.WriteLine("Cleaning deleted tweets from hashtables");
                    neighborFinder.CleanDeletedTweets();
                    Console.WriteLine("Rehashing");
                    neighborFinder.UpdateOldestHashFunction();
                    nextRehash = nextRehash.AddHours(_rehashIntervalHours);
                    Console.WriteLine();
                }

                //Wait out whatever is left of the cycle
                long elapsedMs = cycleTimer.ElapsedMilliseconds;
                if (elapsedMs < cycleLengthMs)
                {
                    Console.WriteLine("Waiting");
                    Thread.Sleep((int)(cycleLengthMs - elapsedMs));
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Entry point of the clustering worker. After creating the hash tables and calling
        /// <c>InitializeWithOldTweets</c> it runs an endless cycle: tweet batches are
        /// processed (with pending story splits and merges applied after every batch), the
        /// story worker is run, and the hash tables are cleaned and rehashed once per
        /// rehash interval.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Minimum length of one cycle, in milliseconds.
            const int cycleLengthMs = 30000;

            Console.WriteLine(Name);

            TweetClusterWorker neighborFinder = new TweetClusterWorker();

            neighborFinder.CreateHashTables();
            Console.WriteLine("Created hash tables");
            neighborFinder.InitializeWithOldTweets();

            // The deadline is kept in UTC: local time jumps by an hour at daylight-saving
            // transitions, which would shift the rehash schedule.
            DateTime nextRehash = DateTime.UtcNow.AddHours(_rehashIntervalHours);
            while (true)
            {
                // Elapsed time is taken from a Stopwatch rather than from DateTime.Now
                // differences, so a wall-clock adjustment can neither produce a negative
                // runtime nor an over-long sleep.
                System.Diagnostics.Stopwatch cycleTimer = System.Diagnostics.Stopwatch.StartNew();

                //Calculate tweet relations and TweetClusterIDs
                // Stops when a batch handled 10 tweets or fewer, or after the batch counter
                // (post-incremented) has reached 10.
                int clusterBatchCount = 0;
                int processedCount;
                do
                {
                    processedCount = neighborFinder.ProcessTweetBatch();

                    //These two are run here as they affect the perceived responsiveness of the front-end
                    StoryWorker.ApplyPendingStorySplits();
                    StoryWorker.ApplyPendingStoryMerges();
                } while (processedCount > 10 && clusterBatchCount++ < 10);
                Console.WriteLine("Calculated relations");

                //Perform agglomerative grouping of clusters into stories
                StoryWorker.Run();

                //Update hash tables
                if (DateTime.UtcNow > nextRehash)
                {
                    Console.WriteLine("Cleaning deleted tweets from hashtables");
                    neighborFinder.CleanDeletedTweets();
                    Console.WriteLine("Rehashing");
                    neighborFinder.UpdateOldestHashFunction();
                    nextRehash = nextRehash.AddHours(_rehashIntervalHours);
                    Console.WriteLine();
                }

                //Wait for the remainder of the cycle
                long remainingMs = cycleLengthMs - cycleTimer.ElapsedMilliseconds;
                if (remainingMs > 0)
                {
                    Console.WriteLine("Waiting");
                    Thread.Sleep((int)remainingMs);
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Batch-processing entry point. Reads tweets in batches of 400 via
        /// <c>GetTweetBatch</c> until the last processed tweet ID reaches the stop ID.
        /// For every batch it counts words (adding sample-stream words), inserts words and
        /// tweets, and - once more than 2000 tweets have been processed - calculates tweet
        /// relations and runs the story worker. Word-statistics maintenance and hash
        /// function updates are driven by the timestamps of the tweets, not by the wall clock.
        /// </summary>
        /// <param name="args">
        /// Optional. When exactly two arguments are given they are parsed as the tweet ID to
        /// start at and the tweet ID to stop at; an argument that is not a valid integer is
        /// ignored and the built-in default is kept.
        /// </param>
        static void Main(string[] args)
        {
            // NOTE(review): 'u' is never used; kept in case reading the setting has a side
            // effect (e.g. loading the settings) - confirm and remove.
            string u = Settings.FilterStreamConsumer_Username;
            Test();

            try
            {
                Output.Print(_name, "Starting batch processing...");

                //Interval
                long startAtID = 143479600000000000;
                long stopAtID = 149276747383320578; //stop at the first tweet of the 2011-12-21 file
                if (args != null && args.Length == 2)
                {
                    // long.TryParse writes 0 to its out argument when parsing fails, so parse
                    // into a temporary and only overwrite the default on success.
                    long parsedID;
                    if (long.TryParse(args[0], out parsedID))
                    {
                        startAtID = parsedID;
                    }
                    else
                    {
                        Console.WriteLine("Ignoring invalid start ID argument: " + args[0]);
                    }

                    if (long.TryParse(args[1], out parsedID))
                    {
                        stopAtID = parsedID;
                    }
                    else
                    {
                        Console.WriteLine("Ignoring invalid stop ID argument: " + args[1]);
                    }
                }

                //Initialize
                long lastProcessedID = startAtID;
                long processedCount = 0;
                DateTime lastRehashTime = new DateTime();
                DateTime lastMaintenanceTime = new DateTime();
                bool initializedNeighborFinder = false;
                InitializeRandomStreamFileNames();

                TweetClusterWorker neighborFinder = new TweetClusterWorker();

                //Process tweets
                while (lastProcessedID < stopAtID)
                {
                    //Get batch of tweets + corresponding batch of random stream words
                    Dictionary<long, Tweet> tweets = GetTweetBatch(lastProcessedID, 400); //Was 200
                    if (tweets.Count == 0)
                    {
                        // Nothing left to read: Max() on an empty batch would throw, so the
                        // run ends here instead.
                        Console.WriteLine("No more tweets to process");
                        break;
                    }
                    Console.WriteLine("Processing " + tweets.Count + " tweets");
                    processedCount += tweets.Count;

                    //Extract words from tweets
                    Dictionary<string, int> wordCounts = new Dictionary<string, int>();
                    foreach (Tweet t in tweets.Values)
                    {
                        foreach (string word in t.Words)
                        {
                            // TryGetValue leaves 0 in 'count' for an unseen word, so a single
                            // lookup covers both the first and the repeated occurrence.
                            int count;
                            wordCounts.TryGetValue(word, out count);
                            wordCounts[word] = count + 1;
                        }
                    }
                    int sampleStreamTweetCount = AddSampleStreamWords(wordCounts, tweets.Keys.Max(), 2000); //Was 2000
                    Console.WriteLine("Calculated word counts (" + sampleStreamTweetCount + " tweets from sample stream)");

                    //Update DB word stats
                    Dictionary<string, Word> words = InsertWords(wordCounts);
                    Console.WriteLine("Inserted " + words.Count + " words");

                    //Insert into Tweet and WordTweet
                    InsertToTweetAndWordTweet(tweets, words);
                    Console.WriteLine("Inserted tweets");

                    DateTime lastTweetTime = tweets.Values.Max(n => n.CreatedAt);
                    if (processedCount > 2000)
                    {
                        if (!initializedNeighborFinder)
                        {
                            neighborFinder.CreateHashTables();
                            neighborFinder.InitializeWithOldTweets();
                            initializedNeighborFinder = true;

                            // The hash tables were just built, so the rehash clock starts now;
                            // otherwise the first rehash check would fire immediately.
                            lastRehashTime = lastTweetTime;
                        }

                        //Calculate tweet relations and TweetClusterIDs
                        while (neighborFinder.ProcessTweetBatch() > 0)
                        {
                            // keep going until a batch reports no processed tweets
                        }
                        Console.WriteLine("Calculated relations");

                        //Perform agglomerative grouping of clusters into stories
                        StoryWorker.Run();
                    }

                    //Perform maintenance of word stats
                    // TotalMinutes is the whole elapsed span; TimeSpan.Minutes is only the
                    // 0-59 minutes component and wraps around every hour.
                    // NOTE(review): 0.890899^6 and 0.99879734^576 are both ~0.5, so the decay
                    // factors look tuned for a 10-minute step (1 h / 4 d half-lives) - confirm.
                    if ((lastTweetTime - lastMaintenanceTime).TotalMinutes > 10)
                    {
                        Helpers.RunSqlStatement(_name, "update WordScore set Score1h = Score1h * 0.890899, Score4d = Score4d * 0.99879734;");
                        Helpers.RunSqlStatement(_name, "delete wt.* from WordScore ws, WordTweet as wt where ws.WordID = wt.WordID and (score4d < 50 and score1h < 0.5);");
                        Helpers.RunSqlStatement(_name, "delete w.*, ws.* from WordScore ws, Word as w where ws.WordID = w.WordID and (score4d < 50 and score1h < 0.5);");
                        Helpers.RunSqlStatement(_name, "update Constants set value = (select 0.1 * max(score4d) from WordScore) where name = 'WordScore4dHigh';");
                        Helpers.RunSqlStatement(_name,
                            @"delete r.* from TweetRelation r natural join 
                            (
                                select TweetID1, TweetID2, tc1.IsArchived or tc2.IsArchived as IsArchived
                                from TweetRelation r
                                join Tweet t1 on t1.TweetID=r.TweetID1
                                join TweetCluster tc1 on tc1.TweetClusterID=t1.TweetClusterID
                                join Tweet t2 on t2.TweetID=r.TweetID2
                                join TweetCluster tc2 on tc2.TweetClusterID=t2.TweetClusterID
                            ) T
                            where IsArchived;");
                        lastMaintenanceTime = lastTweetTime;
                        Console.WriteLine("Performed maintenance");
                    }

                    //Update hashtables
                    // TotalHours for the same reason as above: TimeSpan.Hours is the 0-23 component.
                    if (initializedNeighborFinder && (lastTweetTime - lastRehashTime).TotalHours > 4)
                    {
                        neighborFinder.UpdateOldestHashFunction();
                        lastRehashTime = lastTweetTime;
                        Console.WriteLine("Recalculated a hash function");
                    }

                    lastProcessedID = tweets.Keys.Max();
                }

                Output.Print(_name, "Finished batch processing.");
            }
            catch (Exception e)
            {
                Output.Print(_name, e);
            }
            finally
            {
                //Clean up - in finally so the reader is also released when processing fails
                if (_sampleStreamReader != null)
                {
                    _sampleStreamReader.Close();
                    _sampleStreamReader.Dispose();
                }
            }
        }