Пример #1
0
        /// <summary>
        /// Groups the popularity-scored articles into clusters of similar articles and
        /// writes the resulting ranking to the database.
        /// </summary>
        /// <remarks>
        /// While more than <c>similarityModel.MaxNumSimilar</c> articles remain in the pool,
        /// the entry returned by <c>getMaxVal</c> (presumably the highest-scored one - confirm
        /// against that helper) becomes the title of a new group. The articles reported as
        /// similar to it are attached to the group and removed from the pool. Whatever is
        /// left afterwards is emitted as groups without similar articles.
        /// NOTE(review): entries are removed from the dictionary instance returned by
        /// <c>citationModel.PopularityScoredArticles</c>; whether that is the model's own
        /// storage or a copy is not visible from here.
        /// </remarks>
        /// <param name="citationModel">Model supplying the article-to-score dictionary.</param>
        /// <param name="similarityModel">Model supplying the similar articles for a given article.</param>
        private void RunSimilarityModel(CitationPopularityModel citationModel,
                                        SimilarityRanker similarityModel)
        {
            Dictionary<string, double> scoredArticles = citationModel.PopularityScoredArticles;

            SortedDictionary<double, PopularGroup> result =
                new SortedDictionary<double, PopularGroup>(new DescDuplicateDoubleComp());

            // Count property instead of the LINQ Count() extension: no enumerator detour.
            while (scoredArticles.Count > similarityModel.MaxNumSimilar)
            {
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                KeyValuePair<string, double> thisArt = getMaxVal(scoredArticles);

                var similarArt = similarityModel.GetTopSimilarArticlesTo(thisArt.Key);

                PopularGroup pop = new PopularGroup { TitleArticle = thisArt.Key };
                foreach (KeyValuePair<double, string> p in similarArt)
                {
                    _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                    // Deliberate best-effort: a failure to record one similar article must not
                    // abort the whole ranking run. The type of SimilarArticles is not visible
                    // here, so the catch cannot be narrowed safely.
                    try { pop.SimilarArticles[p.Key] = p.Value; }
                    catch (Exception) { }

                    // Dictionary.Remove only throws for a null key; guard instead of catching.
                    if (p.Value != null)
                    {
                        scoredArticles.Remove(p.Value);
                    }
                }
                result[thisArt.Value] = pop;

                // remove already ranked articles
                if (thisArt.Key != null)
                {
                    scoredArticles.Remove(thisArt.Key);
                }
            }

            // The few left overs: each becomes a group of its own, keyed by its score.
            foreach (KeyValuePair<string, double> leftover in scoredArticles)
            {
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                result[leftover.Value] = new PopularGroup { TitleArticle = leftover.Key };
            }

            // Write the fully ranked cluster results to the database
            RankerDbAccess.WritePopularitySimilarityRankedClustersToDb(
                VerticalId, result, _cToken);
        }
Пример #2
0
        /// <summary>
        /// This is the Run() function, the sole entry point, to start the Ranker.
        /// NOTE: You do not start the Ranker every 15 minutes.
        /// You start the Ranker only once, and it runs forever.
        /// You restart only if it crashes or if the server re-starts.
        /// </summary>
        /// <remarks>
        /// The method first builds the citation model (and, for the PopularitySimilarity
        /// ranking type, the similarity model) from the data of the configured read period,
        /// then enters an endless loop that waits for the remainder of the ranker interval,
        /// reads the data added since the last retrieval, re-ranks and prunes old citations.
        /// The loop only ends through an exception.
        /// </remarks>
        /// <param name="cancellationToken">
        /// Cancellation source for the run. It is stored in <c>_cToken</c> and its token is
        /// checked between all major steps and while waiting for the next run.
        /// </param>
        /// <exception cref="OperationCanceledException">
        /// Thrown when cancellation is requested on <paramref name="cancellationToken"/>.
        /// </exception>
        public void Run(CancellationTokenSource cancellationToken)
        {
            _cToken = cancellationToken;
            _PrlOpt = new ParallelOptions()
            {
                MaxDegreeOfParallelism = System.Environment.ProcessorCount,
                CancellationToken      = cancellationToken.Token
            };

            Log.Info("---------------------------------------------------------------------\n"
                     + "Ranker started for vertical {0}.\n", VerticalId.ToString());

            CitationPopularityModel citationModel = CitationPopularityModel.Instance;

            Log.Info("Citation Model instantiated.");

            SimilarityRanker similarityModel = SimilarityRanker.Instance;

            if (_rankingType == RankingType_T.PopularitySimilarity)
            {
                Log.Info("TF-IDF Similarity Ranker instantiated.");
            }


            DateTimeOffset toTime     = DateTimeOffset.Now;
            uint           readPeriod = Ranker.RankingHoursConsidered + Ranker.RankingHoursBuffer;
            DateTimeOffset fromTime   = toTime.AddHours(-1 * readPeriod);

            // Get data from fromTime to Now from the CITATIONS table
            Dictionary<string, string> rankingData;

            try
            {
                rankingData = RankerDbAccess.ReadRankingDataFromDb(_rankingType, _verticalId,
                                                                   fromTime, toTime);
            }
            catch (Exception)
            {
                Log.Error("Reading CITATIONS table from the database failed.");
                throw;  // bare rethrow keeps the original stack trace
            }
            Log.Info("Ranking Data successfully read in from the CITATIONS table.");
            _PrlOpt.CancellationToken.ThrowIfCancellationRequested();


            try
            {
                citationModel.Init(VerticalId, RankingType, rankingData, _cToken);
            }
            catch (Exception)
            {
                Log.Error("[FATAL]  Citation Popularity Ranker initialization failed!");
                throw;
            }
            Log.Info("Citation Model built for the past {0} hours.\n",
                     Ranker.RankingHoursConsidered);
            _PrlOpt.CancellationToken.ThrowIfCancellationRequested();


            // The first ranking run.
            // Estimate the top ranking Articles and their popularity scores
            try
            {
                citationModel.SetTimeWeightedCitationPopularity();
            }
            catch (Exception)
            {
                Log.Error("Citation Popularity Calculation failed in re-rank. ");
                throw;
            }
            Log.Info("Time weighted Citation Popularity set.");
            _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

            if (_rankingType == RankingType_T.Popularity)
            {
                // Write top ranking Articles and Scores to the RANKING table
                citationModel.WriteRankedArticlesToDB();
                Log.Info("Wrote the latest Citation Popularity scores to the database.");
            }


            //-----------------------------------------------------------------------
            // Similarity Ranker initial run
            if (_rankingType == RankingType_T.PopularitySimilarity)
            {
                try
                {
                    similarityModel.Init(VerticalId, rankingData, _cToken);
                }
                catch (Exception)
                {
                    Log.Error("[FATAL]  TF-IDF Similarity Ranker initialization failed!");
                    throw;
                }
                Log.Info("TF-IDF Similarity Model built for the past {0} hours.\n",
                         Ranker.RankingHoursConsidered + Ranker.RankingHoursBuffer);
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                RunSimilarityModel(citationModel, similarityModel);
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();
            }

            Log.Info("------- Ranker initialization and first run completed. -------\n\n");


            // The last time the data was retrieved and the models successfully built from it.
            _lastDbRetrievalTime = toTime;

            //----------------------------------------------------------------------
            // Ranker initialization completed.  Now run the Ranker infinite loop.
            TimeSpan rankerInterval = TimeSpan.FromMinutes(Ranker.RankerRunPeriod);

            while (true)
            {
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                TimeSpan processingTime = DateTimeOffset.Now - LastDbRetrievalTime;
                if (rankerInterval > processingTime)
                {
                    TimeSpan sleepTime = rankerInterval - processingTime;

                    // Wait on the token's handle rather than Thread.Sleep: the wait still
                    // lasts sleepTime at most, but a cancellation request ends it at once
                    // instead of being noticed only after the full interval has elapsed.
                    _PrlOpt.CancellationToken.WaitHandle.WaitOne(sleepTime);
                }

                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();


                // Get last interval's data from the CITATIONS table
                toTime   = DateTimeOffset.Now;
                fromTime = LastDbRetrievalTime;

                // get data from fromTime to Now from the CITATIONS table
                Log.Info("-------------------------------------------------------------------------------");
                Log.Info("New run. Retrieving Ranking data from the DB tables.");
                try
                {
                    rankingData = RankerDbAccess.ReadRankingDataFromDb(_rankingType, _verticalId,
                                                                       fromTime, toTime);
                }
                catch (Exception)
                {
                    Log.Error("Reading CITATIONS table from the database failed.");
                    throw;
                }
                _lastDbRetrievalTime = toTime;
                Log.Info("Ranking data fetched from the database.");
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                // Update the Citation Model for the last period from the data retrieved
                try
                {
                    citationModel.LoadCitationsModel(RankingType, rankingData, toTime);
                }
                catch (Exception)
                {
                    Log.Error("Ranker Update: LoadCitationsModel() failed.");
                    throw;
                }
                Log.Info("Citation Model successfully re-loaded since the last run.");
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();


                // Estimate the top ranking Articles and their popularity scores
                try
                {
                    citationModel.SetTimeWeightedCitationPopularity();
                }
                catch (Exception)
                {
                    Log.Error("Citation Popularity Calculation failed in re-rank. ");
                    throw;
                }
                Log.Info("Time weighted Citation Popularity set.");
                _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

                if (_rankingType == RankingType_T.Popularity)
                {
                    // Write top ranking Articles and Scores to the RANKING table
                    citationModel.WriteRankedArticlesToDB();
                    Log.Info("Wrote the latest Citation Popularity scores to the database.");
                }

                // The front end then reads the top ranked articles directly from the RANKING table
                // RANKING table:
                // Key: Article Guid/ unique string ID
                // Value: double RankingScore in range [0.00 , 1.00]
                //
                // The front end will then be able to retrieve the most popular articles from the
                // database using the above info and display on the front page.


                //---------------------------------------------------------------------------------
                // TF-IDF Similarity Ranking
                //

                if (RankingType == RankingType_T.PopularitySimilarity)
                {
                    // A failed model update is not fatal: the similarity ranking is skipped
                    // for this run and the loop proceeds straight to the housekeeping below.
                    // NOTE(review): the exception details are dropped here; consider logging
                    // them once the Log.Error overloads are confirmed.
                    bool similarityModelUpdated = false;
                    try
                    {
                        similarityModel.UpdateModel(VerticalId,
                                                    rankingData,
                                                    _cToken);
                        similarityModelUpdated = true;
                    }
                    catch (Exception)
                    {
                        Log.Error("[ERROR] TF-IDF Similarity Model initialization failed!");
                    }

                    if (similarityModelUpdated)
                    {
                        Log.Info("TF-IDF Similarity Model initialized successfully.");
                        _PrlOpt.CancellationToken.ThrowIfCancellationRequested();


                        RunSimilarityModel(citationModel, similarityModel);
                        _PrlOpt.CancellationToken.ThrowIfCancellationRequested();
                    }
                }


                // cleanup and housekeeping
                citationModel.PruneCitationsIndex();
                RankerDbAccess.PruneCitationsTable(VerticalId);
                Log.Info("Housekeeping tasks completed.\n");

                Log.Info("------- Ranker rerun completed. -------\n\n");
            }
        }