/*--------------------------------------------------------------------------
 * Functions for the CITATIONS Table.
 * Table Format:
 *   key:   ArticleGuid or other unique string ID
 *   value: "ArticleCreateTimestamp|citation1|citation2|..."
 */

/// <summary>
/// Builds the CITATIONS table record for a newly added Article. This is the function
/// the Crawler will call every time it adds a new Article to the database,
/// simultaneously populating the CITATIONS table for that Article.
/// </summary>
/// <remarks>
/// For <c>RankingType_T.Popularity</c> the record value is produced by
/// <c>RankingDataProcessor.GetCitationsDbString</c>. For every other ranking type the
/// value is the creation timestamp, the Article URL and the raw Article text joined by
/// <c>RankingDataProcessor.Separator</c>. The actual database write is not implemented
/// in this method yet; only the key and value are prepared.
/// </remarks>
/// <param name="rankingType">Selects which record layout is produced.</param>
/// <param name="verticalId">The Vertical whose CITATIONS table should receive the record (not used yet).</param>
/// <param name="articleId">Uniquely identifies the Article within the Vertical; used as the record key.</param>
/// <param name="articleCreationTime">Creation timestamp of the Article.</param>
/// <param name="articleUrl">URL of the Article.</param>
/// <param name="articleText">Raw text of the Article.</param>
public static void WriteArticleRankingDataToDb(RankingType_T rankingType,
                                               Grouping verticalId,
                                               string articleId, // Uniquely identifies the Article within the Vertical
                                               DateTimeOffset articleCreationTime,
                                               string articleUrl,
                                               string articleText)
{
    string key = articleId;
    string value;

    if (rankingType == RankingType_T.Popularity)
    {
        value = RankingDataProcessor.GetCitationsDbString(articleCreationTime, articleUrl, articleText);
    }
    else
    {
        // Persist the timestamp in the culture-invariant round-trip ("o") format.
        // The default ToString() depends on the current culture and drops sub-second
        // precision, so a record written under one culture may not parse back
        // correctly under another.
        string timestamp = articleCreationTime.ToString(
            "o", System.Globalization.CultureInfo.InvariantCulture);

        StringBuilder sb = new StringBuilder(timestamp);
        sb.Append(RankingDataProcessor.Separator).Append(articleUrl);
        sb.Append(RankingDataProcessor.Separator).Append(articleText);
        value = sb.ToString();
    }

    if (!string.IsNullOrEmpty(value))
    {
        // Write <key, value> to database CITATIONS table for Vertical
    }
}
/// <summary>
/// Rebuilds the URL-to-Article index and the citation model from records read from
/// the CITATIONS table.
/// </summary>
/// <remarks>
/// Every record with at least one URL is added to <c>ArticleUrlIdIndex</c>. Only
/// Articles whose timestamp is no older than <c>Ranker.RankingHoursConsidered</c>
/// hours before <paramref name="to_time"/> contribute citations to the model.
/// Failures on a single record are logged and skipped; cancellation is propagated.
/// </remarks>
/// <param name="rankingType">Ranking type, passed through to the record parser.</param>
/// <param name="dbStr">CITATIONS records: key is the Article ID, value is the stored record string.</param>
/// <param name="to_time">Upper end of the time window considered for the citation model.</param>
public void LoadCitationsModel(RankingType_T rankingType,
                               Dictionary<string, string> dbStr,
                               DateTimeOffset to_time)
{
    DateTimeOffset modelTimeLimit = to_time.AddHours(-1 * Ranker.RankingHoursConsidered);

    foreach (KeyValuePair<string, string> rec in dbStr)
    {
        _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

        KeyValuePair<DateTimeOffset, List<string>> cit =
            RankingDataProcessor.GetCitationsFromDbString(rankingType, rec.Value);

        // The parser returns an empty list when the record carries no URL (or the
        // ranking type is not one it handles). Indexing element 0 would then throw
        // and abort the whole load, so skip such records instead.
        if (cit.Value == null || cit.Value.Count == 0)
        {
            Log.Warn("Ranker init(): No URLs found in CITATIONS record for Article [{0}], ranking type: {1}",
                     rec.Key, rankingType.ToString());
            continue;
        }

        // Add the title Article's URL to the Index
        string articleUrl = cit.Value[0];
        string matchingUrl = RankingDataProcessor.GetMatchingUrl(articleUrl);

        ArtcileIndexInfo idxval = new ArtcileIndexInfo();
        idxval.ArticleId = rec.Key;
        idxval.ArtcileDatetime = cit.Key;

        try
        {
            if (!string.IsNullOrEmpty(matchingUrl))
            {
                ArticleUrlIdIndex[matchingUrl] = idxval;
            }
        }
        catch (Exception) // This Article is already in Index
        {
            // NOTE(review): if ArticleUrlIdIndex is a plain Dictionary, indexer assignment
            // overwrites an existing entry rather than throwing, so this handler would not
            // fire for duplicates - confirm the intended behavior.
            Log.Warn("Ranker init(): Error adding Article for matching URL [{0}] to Index for Article: {1}",
                     matchingUrl, rec.Key);
        }

        // Build the citation model.
        if (idxval.ArtcileDatetime >= modelTimeLimit)
        {
            try
            {
                AddCitationsInArticle(cit.Key, cit.Value);
            }
            catch (OperationCanceledException)
            {
                // AddCitationsInArticle checks the cancellation token; a cancellation
                // request must stop the load rather than be logged as a warning.
                throw;
            }
            catch (Exception)
            {
                // Deliberately best-effort: one bad Article must not abort the load.
                Log.Warn("Adding Citations in Article with ID [{0}] failed, URL: {1}",
                         rec.Key, articleUrl);
            }
        }
    }
}
/// <summary>
/// Add all the citations in the Article to the Citations Model.
/// </summary>
/// <remarks>
/// A failure to add one citation is logged and the remaining citations are still
/// processed. Cancellation is never swallowed.
/// </remarks>
/// <param name="timestamp">timestamp of the parent Article</param>
/// <param name="citations">the list of citations in the Article; element 0 is the
/// parent Article's own URL and is not counted as a citation</param>
private void AddCitationsInArticle(DateTimeOffset timestamp, List<string> citations)
{
    // Start at 1: index 0 is the parent Article's URL.
    for (int i = 1; i < citations.Count; ++i)
    {
        _PrlOpt.CancellationToken.ThrowIfCancellationRequested();

        string matchingUrl = RankingDataProcessor.GetMatchingUrl(citations[i]);
        if (string.IsNullOrEmpty(matchingUrl))
        {
            continue;
        }

        try
        {
            AddCitation(matchingUrl, timestamp);
        }
        catch (OperationCanceledException)
        {
            // Keep cancellation distinct from an ordinary per-citation failure.
            throw;
        }
        catch (Exception)
        {
            Log.Warn("AddCitation() failed at timestamp[{0}] for URL [{1}]",
                     timestamp.ToString("yyyy/MM/dd HH:mm:ss"), citations[i]);
        }
    }
}
/// <summary>
/// Converts and returns the citations from the database string that stores the citations
/// in the CITATIONS table.
/// </summary>
/// <remarks>
/// For <c>RankingType_T.Popularity</c> every field after the timestamp is returned as-is.
/// For <c>RankingType_T.PopularitySimilarity</c> the record is
/// "timestamp, title URL, raw text"; the title URL is returned first, followed by the
/// citations that <c>RankingDataProcessor.GetCitations</c> extracts from the raw text.
/// Any other ranking type yields an empty list. Fields missing from a short record are
/// simply omitted from the result.
/// </remarks>
/// <param name="rankingType">Determines how the fields after the timestamp are interpreted.</param>
/// <param name="dbStr">The database value string from the CITATIONS table</param>
/// <returns>The Article timestamp paired with the list of URLs; when present, the first
/// entry is the title Article's URL.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dbStr"/> is null.</exception>
public static KeyValuePair<DateTimeOffset, List<string>> GetCitationsFromDbString(
    RankingType_T rankingType, string dbStr)
{
    if (dbStr == null)
    {
        throw new ArgumentNullException("dbStr");
    }

    char[] delimiters = { Separator };
    bool isRawTextRecord = rankingType == RankingType_T.PopularitySimilarity;

    // WriteArticleRankingDataToDb appends the raw Article text without escaping the
    // separator, so the text itself may contain it. Capping the split at three fields
    // keeps the whole text in the last field instead of truncating it.
    string[] parts = isRawTextRecord
        ? dbStr.Split(delimiters, 3)
        : dbStr.Split(delimiters);

    DateTimeOffset dt = DateTimeOffset.Parse(parts[0]);
    List<string> urls = new List<string>();

    if (rankingType == RankingType_T.Popularity)
    {
        for (int i = 1; i < parts.Length; ++i)
        {
            urls.Add(parts[i]);
        }
    }
    else if (isRawTextRecord)
    {
        if (parts.Length > 1)
        {
            urls.Add(parts[1]); // this is the title Article's URL
        }

        if (parts.Length > 2)
        {
            // get citations from raw text
            urls.AddRange(RankingDataProcessor.GetCitations(parts[2], true));
        }
    }

    return new KeyValuePair<DateTimeOffset, List<string>>(dt, urls);
}