/// <summary>
/// Reloads the per-domain URL cache and URL tree history from the database.
/// </summary>
/// <remarks>
/// Clears <c>mUrlInfo</c> and <c>mTextBlockInfo</c>, then, for every distinct non-null domain
/// found in <c>Documents</c>, fetches at most <c>mMaxQueueSize</c> documents (ordered by time,
/// descending) together with their text-block hash codes and replays them in reverse row
/// order. Rows older than <c>mHistoryAgeDays</c> days, measured from the first row returned
/// for the domain, are skipped.
/// </remarks>
/// <param name="dbConnection">Connection on which the history queries are executed.</param>
public static void InitializeHistory(DatabaseConnection dbConnection)
{
    Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));
    logger.Info("InitializeHistory", "Loading history ...");
    mUrlInfo.Clear();
    mTextBlockInfo.Clear();
    DataTable domainsTbl = dbConnection.ExecuteQuery("select distinct domain from Documents where domain is not null");
    int domainCount = 0;
    foreach (DataRow domainRow in domainsTbl.Rows)
    {
        string domainName = (string)domainRow["domain"];
        string urlInfoSql = string.Format(
            "select top {0} d.id, d.corpusId, d.time, d.responseUrl, d.urlKey, d.rev, r.maxRev, tb.hashCodes"
            + " from TextBlocks tb, Documents d, (select urlKey, max(rev) as maxRev from Documents group by urlKey) r"
            + " where d.corpusId = tb.corpusId and d.id = tb.docId and r.urlKey = d.urlKey and d.domain = ? order by time desc",
            mMaxQueueSize);
        DataTable urlInfoTbl = dbConnection.ExecuteQuery(urlInfoSql, domainName);
        if (urlInfoTbl.Rows.Count == 0)
        {
            continue;
        }
        Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
        // Cut-off is relative to the first row returned (the query sorts by time, descending).
        DateTime then = DateTime.Parse((string)urlInfoTbl.Rows[0]["time"]) - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
        domainCount++;
        Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
        // Walk the result set backwards so that entries are enqueued oldest-first.
        for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
        {
            DataRow row = urlInfoTbl.Rows[j];
            int maxRev = (int)row["maxRev"];
            int rev = (int)row["rev"];
            string urlKey = (string)row["urlKey"];
            DateTime time = DateTime.Parse((string)row["time"]);
            if (time < then)
            {
                continue;
            }
            bool isFirstRev = rev == 1;
            // URL cache
            if (isFirstRev)
            {
                // NOTE(review): Add throws if the same urlKey occurs with rev == 1 more than once
                // in the loaded window - confirm the data cannot contain such rows.
                urlInfo.First.Add(urlKey, new Ref<int>(maxRev));
                urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
            }
            else
            {
                // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/null, time));
            }
            // URL tree
            string hashCodesBase64 = (string)row["hashCodes"];
            string responseUrl = (string)row["responseUrl"];
            byte[] buffer = Convert.FromBase64String(hashCodesBase64);
            BinarySerializer memSer = new BinarySerializer(new MemoryStream(buffer));
            ArrayList<ulong> hashCodes = new ArrayList<ulong>(memSer);
            bool fullPath = urlKey.Contains("?");
            TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/isFirstRev);
            textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, /*incDocCount=*/isFirstRev);
            textBlockInfo.Second.Enqueue(entry);
        }
    }
    logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
}
/// <summary>
/// Inserts a document's text-block hash codes into the URL tree and appends the
/// corresponding history entry to the text-block queue.
/// </summary>
/// <param name="textBlockInfo">Pair holding the URL tree (First) and its history queue (Second).</param>
/// <param name="responseUrl">Response URL under which the hash codes are inserted.</param>
/// <param name="hashCodes">Text-block hash codes of the document.</param>
/// <param name="fullPath">Passed through to the tree insert and stored in the history entry.</param>
/// <param name="corpusId">Not used by this method.</param>
/// <param name="documentId">Not used by this method.</param>
/// <param name="domainName">Not used by this method.</param>
/// <param name="time">Timestamp stored in the history entry.</param>
/// <param name="incDocCount">Passed to the tree insert; also stored as the entry's decDocCount flag.</param>
private void AddToUrlTree(
    Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo,
    string responseUrl,
    ArrayList<ulong> hashCodes,
    bool fullPath,
    string corpusId,
    string documentId,
    string domainName,
    DateTime time,
    bool incDocCount)
{
    UrlTree tree = textBlockInfo.First;
    Queue<TextBlockHistoryEntry> history = textBlockInfo.Second;

    // The flag that incremented the document count on insert is remembered in the entry.
    TextBlockHistoryEntry newEntry = new TextBlockHistoryEntry(
        responseUrl, hashCodes, fullPath, time, /*decDocCount=*/incDocCount);

    tree.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, incDocCount);
    history.Enqueue(newEntry);
}
/// <summary>
/// Evicts the oldest history entries from the URL cache and the URL tree.
/// </summary>
/// <remarks>
/// Eviction continues while the URL queue holds more than <c>mMinQueueSize</c> entries and
/// either its oldest entry is more than <c>mHistoryAgeDays</c> days older than
/// <paramref name="time"/> or the queue holds more than <c>mMaxQueueSize</c> entries.
/// Every iteration dequeues one entry from each of the two queues.
/// </remarks>
/// <param name="urlInfo">URL cache dictionary (First) and its history queue (Second).</param>
/// <param name="textBlockInfo">URL tree (First) and its history queue (Second).</param>
/// <param name="time">Reference time against which entry age is measured.</param>
private void RemoveItems(
    Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo,
    Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo,
    DateTime time)
{
    while (urlInfo.Second.Count > mMinQueueSize)
    {
        double ageDays = (time - urlInfo.Second.Peek().mTime).TotalDays;
        bool expired = ageDays > (double)mHistoryAgeDays;
        bool overflowing = urlInfo.Second.Count > mMaxQueueSize;
        if (!expired && !overflowing)
        {
            break;
        }

        // URL cache: dummy entries carry a null key and have nothing to remove.
        UrlHistoryEntry oldestUrl = urlInfo.Second.Dequeue();
        string rmvUrlKey = oldestUrl.mUrlKey;
        if (rmvUrlKey != null)
        {
            urlInfo.First.Remove(rmvUrlKey);
        }

        // URL tree: undo the insert recorded by the matching text-block entry.
        TextBlockHistoryEntry oldestBlock = textBlockInfo.Second.Dequeue();
        textBlockInfo.First.Remove(
            oldestBlock.mResponseUrl,
            oldestBlock.mHashCodes,
            oldestBlock.mFullPath,
            /*unique=*/true,
            oldestBlock.mDecDocCount);

        mLogger.Info("RemoveItems", "Removed entry from URL tree. UrlKey={0} QueueSize={1} Age={2}", rmvUrlKey, urlInfo.Second.Count, ageDays);
    }
}
/// <summary>
/// Inserts a document's text-block hash codes into the URL tree, records the history
/// entry, and, when a database connection is configured, stores the hash codes in the
/// TextBlocks table.
/// </summary>
/// <param name="textBlockInfo">Pair holding the URL tree (First) and its history queue (Second).</param>
/// <param name="responseUrl">Response URL under which the hash codes are inserted.</param>
/// <param name="hashCodes">Text-block hash codes of the document.</param>
/// <param name="fullPath">Passed through to the tree insert and stored in the history entry.</param>
/// <param name="corpusId">Value written to the corpusId column.</param>
/// <param name="documentId">Value written to the docId column.</param>
/// <param name="domainName">Not used by this method.</param>
/// <param name="time">Timestamp stored in the history entry.</param>
/// <param name="incDocCount">Passed to the tree insert; also stored as the entry's decDocCount flag.</param>
private void AddToUrlTree(
    Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo,
    string responseUrl,
    ArrayList<ulong> hashCodes,
    bool fullPath,
    string corpusId,
    string documentId,
    string domainName,
    DateTime time,
    bool incDocCount)
{
    UrlTree tree = textBlockInfo.First;
    Queue<TextBlockHistoryEntry> history = textBlockInfo.Second;
    TextBlockHistoryEntry newEntry = new TextBlockHistoryEntry(
        responseUrl, hashCodes, fullPath, time, /*decDocCount=*/incDocCount);
    tree.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, incDocCount);
    history.Enqueue(newEntry);

    // Persistence is optional: without a connection only the in-memory state is updated.
    if (mDbConnection == null)
    {
        return;
    }

    BinarySerializer serializer = new BinarySerializer();
    newEntry.mHashCodes.Save(serializer);
    byte[] rawBuffer = ((MemoryStream)serializer.Stream).GetBuffer();
    // Only the bytes written so far are encoded; the underlying buffer may be larger.
    int writtenCount = (int)serializer.Stream.Position;
    string hashCodesBase64 = Convert.ToBase64String(rawBuffer, 0, writtenCount);
    mDbConnection.ExecuteNonQuery("insert into TextBlocks (corpusId, docId, hashCodes) values (?, ?, ?)", corpusId, documentId, hashCodesBase64);
}
/// <summary>
/// Adds a document to the URL tree and its history queue and, if <c>mDbConnection</c>
/// is set, writes the Base64-encoded hash codes to the TextBlocks table.
/// </summary>
/// <param name="textBlockInfo">URL tree (First) paired with its history queue (Second).</param>
/// <param name="responseUrl">Response URL used as the insertion path.</param>
/// <param name="hashCodes">Hash codes of the document's text blocks.</param>
/// <param name="fullPath">Forwarded to the tree insert and kept in the history entry.</param>
/// <param name="corpusId">Written to the corpusId column of the inserted row.</param>
/// <param name="documentId">Written to the docId column of the inserted row.</param>
/// <param name="domainName">Unused here.</param>
/// <param name="time">Timestamp kept in the history entry.</param>
/// <param name="incDocCount">Forwarded to the tree insert and kept as the entry's decDocCount flag.</param>
private void AddToUrlTree(Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo, string responseUrl, ArrayList<ulong> hashCodes, bool fullPath, string corpusId, string documentId, string domainName, DateTime time, bool incDocCount)
{
    UrlTree targetTree = textBlockInfo.First;
    Queue<TextBlockHistoryEntry> entryQueue = textBlockInfo.Second;

    TextBlockHistoryEntry queued = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/incDocCount);
    targetTree.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, incDocCount);
    entryQueue.Enqueue(queued);

    if (mDbConnection != null)
    {
        // Serialize the hash codes to memory and encode only the part that was written.
        BinarySerializer ser = new BinarySerializer();
        queued.mHashCodes.Save(ser);
        byte[] bytes = ((MemoryStream)ser.Stream).GetBuffer();
        string encoded = Convert.ToBase64String(bytes, 0, (int)ser.Stream.Position);

        mDbConnection.ExecuteNonQuery(
            "insert into TextBlocks (corpusId, docId, hashCodes) values (?, ?, ?)",
            corpusId,
            documentId,
            encoded);
    }
}
/// <summary>
/// Reloads the per-domain URL cache and URL tree history from the database, considering
/// up to 3000 domains per selection criterion.
/// </summary>
/// <param name="dbConnection">Connection on which the history queries are executed.</param>
public static void InitializeHistory(DatabaseConnection dbConnection)
{
    const int defaultMaxDomains = 3000;
    InitializeHistory(dbConnection, defaultMaxDomains);
}

/// <summary>
/// Reloads the per-domain URL cache and URL tree history from the database.
/// </summary>
/// <remarks>
/// Clears <c>mUrlInfo</c> and <c>mTextBlockInfo</c>. The domains loaded are the union of the
/// <paramref name="maxDomains"/> domains with the latest <c>MAX(time)</c> and the
/// <paramref name="maxDomains"/> domains with the highest document count. For each domain at
/// most <c>mMaxQueueSize</c> documents (ordered by time, descending) are fetched together
/// with their text-block hash codes and replayed in reverse row order. Rows older than
/// <c>mHistoryAgeDays</c> days, measured from the first row returned for the domain, are
/// skipped.
/// </remarks>
/// <param name="dbConnection">Connection on which the history queries are executed.</param>
/// <param name="maxDomains">Value used for both <c>TOP</c> clauses of the domain query; must be positive.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxDomains"/> is zero or negative.</exception>
public static void InitializeHistory(DatabaseConnection dbConnection, int maxDomains)
{
    if (maxDomains <= 0)
    {
        throw new ArgumentOutOfRangeException("maxDomains");
    }
    Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));
    logger.Info("InitializeHistory", "Loading history ...");
    mUrlInfo.Clear();
    mTextBlockInfo.Clear();
    string domainsSql = string.Format(
        " SELECT DISTINCT domain FROM (SELECT * FROM (SELECT TOP {0} domain FROM Documents WHERE domain IS NOT NULL GROUP BY domain ORDER BY MAX(time) DESC) x"
        + " UNION SELECT * FROM (SELECT TOP {0} domain FROM Documents WHERE domain IS NOT NULL GROUP BY domain ORDER BY COUNT(*) DESC) y) z",
        maxDomains);
    DataTable domainsTbl = dbConnection.ExecuteQuery(domainsSql);
    int domainCount = 0;
    foreach (DataRow domainRow in domainsTbl.Rows)
    {
        string domainName = (string)domainRow["domain"];
        // maxRev is the rev of the most recent document (by time, then rev) sharing the urlKey.
        string urlInfoSql = string.Format(
            " SELECT TOP {0} d.id, d.corpusId, d.time, d.responseUrl, d.urlKey, d.rev, d.domain,"
            + " (SELECT TOP 1 dd.rev from Documents dd WHERE dd.urlKey = d.urlKey ORDER BY dd.time DESC, dd.rev DESC) AS maxRev,"
            + " tb.hashCodes FROM Documents d INNER JOIN TextBlocks tb ON d.corpusId = tb.corpusId AND d.id = tb.docId"
            + " WHERE d.domain = ? ORDER BY d.time DESC",
            mMaxQueueSize);
        DataTable urlInfoTbl = dbConnection.ExecuteQuery(urlInfoSql, domainName);
        if (urlInfoTbl.Rows.Count == 0)
        {
            continue;
        }
        Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
        // Cut-off is relative to the first row returned (the query sorts by time, descending).
        DateTime then = DateTime.Parse((string)urlInfoTbl.Rows[0]["time"]) - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
        domainCount++;
        Console.WriteLine("* " + domainName + string.Format(" ({0}/{1})", domainCount, domainsTbl.Rows.Count));
        Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
        // Walk the result set backwards so that entries are enqueued oldest-first.
        for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
        {
            DataRow row = urlInfoTbl.Rows[j];
            int rev = (int)row["rev"];
            int maxRev = (int)row["maxRev"];
            string urlKey = (string)row["urlKey"];
            DateTime time = DateTime.Parse((string)row["time"]);
            if (time < then)
            {
                continue;
            }
            bool isFirstRev = rev == 1;
            // URL cache
            if (isFirstRev)
            {
                // A key already present is removed first so that Add cannot fail on a duplicate.
                if (urlInfo.First.ContainsKey(urlKey))
                {
                    Remove(urlKey, urlInfo);
                }
                urlInfo.First.Add(urlKey, new Ref<int>(maxRev));
                urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
            }
            else
            {
                // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/null, time));
            }
            // URL tree
            string hashCodesBase64 = (string)row["hashCodes"];
            string responseUrl = (string)row["responseUrl"];
            byte[] buffer = Convert.FromBase64String(hashCodesBase64);
            BinarySerializer memSer = new BinarySerializer(new MemoryStream(buffer));
            ArrayList<ulong> hashCodes = new ArrayList<ulong>(memSer);
            bool fullPath = urlKey.Contains("?");
            TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/isFirstRev);
            textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, /*incDocCount=*/isFirstRev);
            textBlockInfo.Second.Enqueue(entry);
        }
    }
    logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
}
/// <summary>
/// Reloads the per-domain URL cache and URL tree history from a SQL Server database,
/// considering up to 3000 domains per selection criterion.
/// </summary>
/// <param name="dbConnectionString">Connection string used to open the database connection.</param>
public static void InitializeHistory(string dbConnectionString)
{
    const int defaultMaxDomains = 3000;
    InitializeHistory(dbConnectionString, defaultMaxDomains);
}

/// <summary>
/// Reloads the per-domain URL cache and URL tree history from a SQL Server database.
/// </summary>
/// <remarks>
/// Clears <c>mUrlInfo</c> and <c>mTextBlockInfo</c>. The domains loaded are the union of the
/// <paramref name="maxDomains"/> domains with the latest <c>MAX(time)</c> and the
/// <paramref name="maxDomains"/> domains with the highest document count. For each domain at
/// most <c>mMaxQueueSize</c> documents (ordered by time, descending) are fetched together
/// with their text-block hash codes and replayed in reverse row order. Rows older than
/// <c>mHistoryAgeDays</c> days, measured from the first row returned for the domain, are
/// skipped.
/// </remarks>
/// <param name="dbConnectionString">Connection string used to open the database connection.</param>
/// <param name="maxDomains">Value used for both <c>TOP</c> clauses of the domain query; must be positive.</param>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxDomains"/> is zero or negative.</exception>
public static void InitializeHistory(string dbConnectionString, int maxDomains)
{
    if (maxDomains <= 0)
    {
        throw new ArgumentOutOfRangeException("maxDomains");
    }
    Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));
    logger.Info("InitializeHistory", "Loading history ...");
    mUrlInfo.Clear();
    mTextBlockInfo.Clear();
    int domainCount = 0;
    using (SqlConnection dbConnection = new SqlConnection(dbConnectionString))
    {
        dbConnection.Open();

        string domainsSql = string.Format(
            " SELECT DISTINCT domainName FROM ( SELECT * FROM (SELECT TOP {0} domainName FROM Documents WHERE domainName IS NOT NULL GROUP BY domainName ORDER BY MAX(time) DESC) x"
            + " UNION SELECT * FROM (SELECT TOP {0} domainName FROM Documents WHERE domainName IS NOT NULL GROUP BY domainName ORDER BY COUNT(*) DESC) y ) xy",
            maxDomains);
        DataTable domainsTbl = new DataTable();
        using (SqlCommand sqlCmd = new SqlCommand(domainsSql, dbConnection))
        using (SqlDataReader sqlReader = sqlCmd.ExecuteReader())
        {
            domainsTbl.Load(sqlReader);
        }

        // Formatted exactly once: the result is SQL text and must not be parsed as a format string again.
        // maxRev is the rev of the most recent document (by time, then rev) sharing the urlKey.
        string urlInfoSql = string.Format(
            " SELECT TOP {0} d.guid, d.time, d.responseUrl, d.urlKey, d.rev, d.domainName,"
            + " (SELECT TOP 1 dd.rev from Documents dd WHERE dd.urlKey = d.urlKey ORDER BY dd.time DESC, dd.rev DESC) AS maxRev,"
            + " tb.hashCodes FROM Documents d INNER JOIN TextBlocks tb ON d.guid = tb.docGuid"
            + " WHERE d.domainName = @domainName ORDER BY d.time DESC ",
            mMaxQueueSize);

        foreach (DataRow domainRow in domainsTbl.Rows)
        {
            string domainName = (string)domainRow["domainName"];
            DataTable urlInfoTbl = new DataTable();
            using (SqlCommand sqlCmd = new SqlCommand(urlInfoSql, dbConnection))
            {
                sqlCmd.AssignParams("domainName", domainName);
                using (SqlDataReader sqlReader = sqlCmd.ExecuteReader())
                {
                    urlInfoTbl.Load(sqlReader);
                }
            }
            if (urlInfoTbl.Rows.Count == 0)
            {
                continue;
            }
            Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
            // Cut-off is relative to the first row returned (the query sorts by time, descending).
            DateTime then = (DateTime)urlInfoTbl.Rows[0]["time"] - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
            domainCount++;
            Console.WriteLine("* " + domainName + string.Format(" ({0}/{1})", domainCount, domainsTbl.Rows.Count));
            Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
            // Walk the result set backwards so that entries are enqueued oldest-first.
            for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
            {
                DataRow row = urlInfoTbl.Rows[j];
                int rev = (int)row["rev"];
                int maxRev = (int)row["maxRev"];
                string urlKey = (string)row["urlKey"];
                DateTime time = (DateTime)row["time"];
                if (time < then)
                {
                    continue;
                }
                bool isFirstRev = rev == 1;
                // URL cache
                if (isFirstRev)
                {
                    // A key already present is removed first so that Add cannot fail on a duplicate.
                    if (urlInfo.First.ContainsKey(urlKey))
                    {
                        Remove(urlKey, urlInfo);
                    }
                    urlInfo.First.Add(urlKey, new Ref<int>(maxRev));
                    urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
                }
                else
                {
                    // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                    urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/null, time));
                }
                // URL tree: hash codes are read as raw bytes from the hashCodes column.
                string responseUrl = (string)row["responseUrl"];
                byte[] buffer = (byte[])row["hashCodes"];
                BinarySerializer memSer = new BinarySerializer(new MemoryStream(buffer));
                ArrayList<ulong> hashCodes = new ArrayList<ulong>(memSer);
                bool fullPath = urlKey.Contains("?");
                TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/isFirstRev);
                textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, /*incDocCount=*/isFirstRev);
                textBlockInfo.Second.Enqueue(entry);
            }
        }
    }
    logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
}