Ejemplo n.º 1
0
        /// <summary>
        /// Builds the initial citation model for this vertical from the Articles stored in the
        /// Pickscomb repository.
        /// </summary>
        /// <remarks>
        /// Fetches the Articles of <c>VerticalId</c> whose time falls between
        /// (UtcNow - <c>MaxHoursConsidered</c> hours) and UtcNow, then passes each one to
        /// <c>AddCitationsInArticle</c>.
        /// </remarks>
        /// <exception cref="Exception">
        /// Thrown when the Articles cannot be fetched from the repository. The original failure
        /// is attached as <see cref="Exception.InnerException"/>.
        /// </exception>
        private void initModelFromRepo()
        {
            // Take a single clock reading so both ends of the window derive from the same instant.
            DateTime endt   = DateTime.UtcNow;
            DateTime startt = endt.AddHours(-1 * MaxHoursConsidered);

            logger.Info(String.Format("initModelFromRepo(): Building Initial Model for Articles in Vertical {0} \n\t from startTime [{1}] to endTime [{2}].",
                                      VerticalConstants.GetFriendlyName(VerticalId), startt.ToString(), endt.ToString()));

            logger.Debug("Retrieving Articles from Pickscomb repository.");
            List <Article> articles;

            try
            {
                IPickscombRepository repo = PickscombRepository.Instance;
                logger.Info("PickscombRepository instantiated. Fetching Articles . . .");
                articles = repo.GetArticles(VerticalId, startt, endt).ToList();
            }
            catch (Exception ex)
            {
                const string failure = "VerticalCitationStore::initModelFromRepo() failed to fetch Articles from Pickscomb repository.";

                logger.Error(failure);
                logger.Error(ex.Message);

                // Chain the caught exception so its type and stack trace reach the caller
                // instead of being discarded.
                throw new Exception(failure, ex);
            }

            logger.Info(String.Format("Retrieved {0} Articles from the repository. Building the initial model . . .", articles.Count));

            uint count = 0;

            foreach (Article art in articles)
            {
                logger.Info(string.Format("Adding #[{0}]", ++count));
                AddCitationsInArticle(art);
            }

            logger.Info("Initial CitationStore Model build complete.\n\n");
        }
        /// <summary>
        /// Ranks the cited URLs of one vertical by time-weighted citation popularity and records
        /// the outcome in the per-vertical collections of this instance.
        /// </summary>
        /// <remarks>
        /// URLs are visited in descending score order. A URL that maps to an Article Guid in
        /// <c>CitationStore.RepoArticleUrlGuidIndex</c> is added to
        /// <c>_allRankedArticleMetrics[vIndex]</c>; any other URL is added to
        /// <c>_rankedURLNotInDB[vIndex]</c> together with its ranking and score.
        /// <c>dataSetV[vIndex]</c> is false while the method runs and is set to true only when it
        /// completes, so it stays false if an exception escapes.
        /// Nothing is read from or written to the repository here; the fetch and save
        /// operations were disabled as too expensive.
        /// </remarks>
        /// <param name="vIndex">Index of the vertical in <c>VerticalConstants.AllVerticals</c>.</param>
        /// <param name="numRankedArticlesToExtract">
        /// Not used by the current implementation; kept so existing callers continue to compile.
        /// </param>
        public void rankOnCitationPopularity(uint vIndex, uint numRankedArticlesToExtract)
        {
            NPreFetched      = 0;
            dataSetV[vIndex] = false;

            Guid vid = VerticalConstants.AllVerticals[vIndex];

            Guid artid;
            uint ranking = 0, inDBCount = 0;

            CitationStore.Instance.CStore[vid].setTimeWeightedCitationPopularity();
            Dictionary <string, double> rankedURLs = CitationStore.Instance.CStore[vid].getTimeWeightedCitationPopularityScores();

            // Enumerate the ordered sequence directly: a Dictionary does not guarantee any
            // enumeration order, so copying the sorted pairs back into one would not help.
            foreach (KeyValuePair <string, double> url in rankedURLs.OrderByDescending(y => y.Value))
            {
                ++ranking;

                if (CitationStore.RepoArticleUrlGuidIndex[vid].TryGetValue(url.Key, out artid))
                {
                    // Article for this URL is in DB; artid now holds its Guid.
                    ++inDBCount;

                    // NOTE(review): this collection is keyed by score. If it rejects duplicate
                    // keys, two Articles with equal scores would throw here - confirm its type.
                    _allRankedArticleMetrics[vIndex].Add(url.Value, artid);

                    // Log only the top 100 (inDBCount is 1-based at this point).
                    if (inDBCount <= 100)
                    {
                        logger.DebugFormat(" Found Article Scored [{0}] in DB with Guid [{1}] Ranking [{2}]",
                                           url.Value, artid, inDBCount);
                    }
                }
                else
                {
                    // No Article in the DB for this URL; remember its ranking and score.
                    _rankedURLNotInDB[vIndex].Add(url.Key, new RankingMetrics(ranking, url.Value));
                }
            }

            logger.Debug("\n");
            logger.DebugFormat("~~~ Found [{0}] Cited Articles in DB.", inDBCount);
            logger.DebugFormat("~~~ Found [{0}] Total Cited Article URLs.", ranking);
            if (ranking > 0)
            {
                uint notInDB = ranking - inDBCount;

                // Floating-point division so the F4 format shows the real fractional percentage.
                logger.DebugFormat("~~~ [{0}] ({1:F4}%) Citated URL's do not have Articles in the database.\n\n",
                                   notInDB, notInDB * 100.0 / ranking);
            }

            // Articles inside the time frame that have no citations are intentionally not
            // appended to the ranking here: doing so would require fetching them from the
            // repository, which is too expensive.

            _averagePopularityScore[vIndex] = CitationStore.Instance.CStore[vid].AveragePopularityScore;
            dataSetV[vIndex] = true;
        }