コード例 #1
0
        /// <summary>
        /// Rebuilds the in-memory URL cache and URL tree history from the database.
        /// </summary>
        /// <remarks>
        /// Clears <c>mUrlInfo</c> and <c>mTextBlockInfo</c>, then, for every distinct non-null domain in
        /// <c>Documents</c>, loads up to <c>mMaxQueueSize</c> document/text-block rows (newest first) and replays,
        /// oldest first, those rows that are at most <c>mHistoryAgeDays</c> days older than the newest row.
        /// </remarks>
        /// <param name="dbConnection">Connection used to run the history queries.</param>
        public static void InitializeHistory(DatabaseConnection dbConnection)
        {
            Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));

            logger.Info("InitializeHistory", "Loading history ...");
            mUrlInfo.Clear();
            mTextBlockInfo.Clear();
            DataTable domainsTbl  = dbConnection.ExecuteQuery("select distinct domain from Documents where domain is not null");
            int       domainCount = 0;

            foreach (DataRow domainRow in domainsTbl.Rows)
            {
                string    domainName = (string)domainRow["domain"];
                DataTable urlInfoTbl = dbConnection.ExecuteQuery(string.Format("select top {0} d.id, d.corpusId, d.time, d.responseUrl, d.urlKey, d.rev, r.maxRev, tb.hashCodes from TextBlocks tb, Documents d, (select urlKey, max(rev) as maxRev from Documents group by urlKey) r where d.corpusId = tb.corpusId and d.id = tb.docId and r.urlKey = d.urlKey and d.domain = ? order by time desc", mMaxQueueSize), domainName);
                if (urlInfoTbl.Rows.Count == 0)
                {
                    continue;
                }
                Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo = GetTextBlockInfo(domainName);
                // The query sorts by time descending, so row 0 is the newest document; the history window ends there.
                DateTime then = DateTime.Parse((string)urlInfoTbl.Rows[0]["time"]) - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
                domainCount++;
                Pair <Dictionary <string, Ref <int> >, Queue <UrlHistoryEntry> > urlInfo = GetUrlInfo(domainName);
                // Walk the rows backwards so that entries are enqueued oldest first.
                for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
                {
                    DataRow  row  = urlInfoTbl.Rows[j];
                    DateTime time = DateTime.Parse((string)row["time"]);
                    if (time < then)
                    {
                        continue; // outside the history window; nothing else from this row is needed
                    }
                    int    maxRev = (int)row["maxRev"];
                    int    rev    = (int)row["rev"];
                    string urlKey = (string)row["urlKey"];
                    // URL cache
                    if (rev == 1)
                    {
                        urlInfo.First.Add(urlKey, new Ref <int>(maxRev));
                        urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
                    }
                    else
                    {
                        urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/ null, time)); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                    }
                    // URL tree
                    string            hashCodesBase64 = (string)row["hashCodes"];
                    string            responseUrl     = (string)row["responseUrl"];
                    byte[]            buffer          = Convert.FromBase64String(hashCodesBase64);
                    BinarySerializer  memSer          = new BinarySerializer(new MemoryStream(buffer));
                    ArrayList <ulong> hashCodes       = new ArrayList <ulong>(memSer);
                    bool fullPath = urlKey.Contains("?");
                    // Only the first revision of a document changes the document count; the entry records the same flag.
                    TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/ rev == 1);
                    textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/ true, /*incDocCount=*/ rev == 1);
                    textBlockInfo.Second.Enqueue(entry);
                }
            }
            logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
        }
コード例 #2
0
        /// <summary>
        /// Inserts a document's text-block hash codes into the URL tree and appends a matching entry to the
        /// text-block history queue.
        /// </summary>
        /// <remarks>
        /// <paramref name="corpusId"/>, <paramref name="documentId"/> and <paramref name="domainName"/> are not
        /// used by this implementation.
        /// </remarks>
        private void AddToUrlTree(Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo, string responseUrl, ArrayList <ulong> hashCodes, bool fullPath, string corpusId,
                                  string documentId, string domainName, DateTime time, bool incDocCount)
        {
            // The history entry stores incDocCount as its decDocCount flag.
            TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/ incDocCount);

            textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/ true, incDocCount);
            textBlockInfo.Second.Enqueue(entry);
        }
コード例 #3
0
        /// <summary>
        /// Evicts the oldest history entries from the URL cache and the URL tree.
        /// </summary>
        /// <remarks>
        /// An entry is evicted while the URL queue holds more than <c>mMinQueueSize</c> items and either the
        /// oldest item is more than <c>mHistoryAgeDays</c> days older than <paramref name="time"/> or the queue
        /// holds more than <c>mMaxQueueSize</c> items. Each eviction dequeues one item from both queues.
        /// </remarks>
        private void RemoveItems(Pair <Dictionary <string, Ref <int> >, Queue <UrlHistoryEntry> > urlInfo, Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo, DateTime time)
        {
            Queue <UrlHistoryEntry> urlQueue = urlInfo.Second;
            double ageDays = 0;

            while (urlQueue.Count > mMinQueueSize)
            {
                ageDays = (time - urlQueue.Peek().mTime).TotalDays;
                bool expired = ageDays > (double)mHistoryAgeDays;
                if (!expired && !(urlQueue.Count > mMaxQueueSize))
                {
                    break; // oldest entry is still fresh and the queue is within its size limit
                }
                string rmvUrlKey = urlQueue.Dequeue().mUrlKey;
                // Dummy queue entries carry a null key and have no dictionary counterpart.
                if (rmvUrlKey != null)
                {
                    urlInfo.First.Remove(rmvUrlKey);
                }
                TextBlockHistoryEntry evicted = textBlockInfo.Second.Dequeue();
                textBlockInfo.First.Remove(evicted.mResponseUrl, evicted.mHashCodes, evicted.mFullPath, /*unique=*/ true, evicted.mDecDocCount);
                mLogger.Info("RemoveItems", "Removed entry from URL tree. UrlKey={0} QueueSize={1} Age={2}", rmvUrlKey, urlQueue.Count, ageDays);
            }
        }
コード例 #4
0
        /// <summary>
        /// Inserts a document's text-block hash codes into the URL tree, appends a matching entry to the
        /// text-block history queue and, when a database connection is configured, stores the hash codes in
        /// the <c>TextBlocks</c> table.
        /// </summary>
        /// <remarks>
        /// <paramref name="domainName"/> is not used by this implementation.
        /// </remarks>
        private void AddToUrlTree(Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo, string responseUrl, ArrayList <ulong> hashCodes, bool fullPath, string corpusId,
                                  string documentId, string domainName, DateTime time, bool incDocCount)
        {
            // The history entry stores incDocCount as its decDocCount flag.
            TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/ incDocCount);

            textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/ true, incDocCount);
            textBlockInfo.Second.Enqueue(entry);
            if (mDbConnection == null)
            {
                return; // no persistence configured
            }
            // Serialize the hash codes and store them Base64-encoded.
            BinarySerializer serializer = new BinarySerializer();
            entry.mHashCodes.Save(serializer);
            byte[] raw = ((MemoryStream)serializer.Stream).GetBuffer();
            // GetBuffer exposes the whole backing array, so only the bytes up to the stream position are encoded.
            int    usedBytes = (int)serializer.Stream.Position;
            string encoded   = Convert.ToBase64String(raw, 0, usedBytes);
            mDbConnection.ExecuteNonQuery("insert into TextBlocks (corpusId, docId, hashCodes) values (?, ?, ?)", corpusId, documentId, encoded);
        }
 /// <summary>
 /// Inserts a document's text-block hash codes into the URL tree, appends a matching entry to the
 /// text-block history queue and, when a database connection is configured, stores the hash codes in
 /// the <c>TextBlocks</c> table.
 /// </summary>
 /// <remarks>
 /// <paramref name="domainName"/> is not used by this implementation.
 /// </remarks>
 private void AddToUrlTree(Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo, string responseUrl, ArrayList<ulong> hashCodes, bool fullPath, string corpusId,
     string documentId, string domainName, DateTime time, bool incDocCount)
 {
     // The history entry stores incDocCount as its decDocCount flag.
     TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/incDocCount);
     textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, incDocCount);
     textBlockInfo.Second.Enqueue(entry);
     if (mDbConnection == null)
     {
         return; // no persistence configured
     }
     // Serialize the hash codes and store them Base64-encoded.
     BinarySerializer serializer = new BinarySerializer();
     entry.mHashCodes.Save(serializer);
     byte[] raw = ((MemoryStream)serializer.Stream).GetBuffer();
     // GetBuffer exposes the whole backing array, so only the bytes up to the stream position are encoded.
     int usedBytes = (int)serializer.Stream.Position;
     string encoded = Convert.ToBase64String(raw, 0, usedBytes);
     mDbConnection.ExecuteNonQuery("insert into TextBlocks (corpusId, docId, hashCodes) values (?, ?, ?)", corpusId, documentId, encoded);
 }
 /// <summary>
 /// Rebuilds the in-memory URL cache and URL tree history from the database.
 /// </summary>
 /// <remarks>
 /// Clears <c>mUrlInfo</c> and <c>mTextBlockInfo</c>, then selects the union of the domains with the most
 /// recent documents and the domains with the most documents (a fixed number of each). For every selected
 /// domain it loads up to <c>mMaxQueueSize</c> document/text-block rows (newest first) and replays, oldest
 /// first, those rows that are at most <c>mHistoryAgeDays</c> days older than the newest row.
 /// </remarks>
 /// <param name="dbConnection">Connection used to run the history queries.</param>
 public static void InitializeHistory(DatabaseConnection dbConnection)
 {
     // TODO: make this configurable. Number of domains taken from each ranking (most recent / most documents).
     const int domainLimit = 3000;
     Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));
     logger.Info("InitializeHistory", "Loading history ...");
     mUrlInfo.Clear();
     mTextBlockInfo.Clear();
     DataTable domainsTbl = dbConnection.ExecuteQuery(string.Format(@"
         SELECT DISTINCT domain FROM
         (SELECT * FROM (SELECT TOP {0} domain FROM Documents WHERE domain IS NOT NULL GROUP BY domain ORDER BY MAX(time) DESC) x
         UNION
         SELECT * FROM (SELECT TOP {0} domain FROM Documents WHERE domain IS NOT NULL GROUP BY domain ORDER BY COUNT(*) DESC) y) z", domainLimit));
     int domainCount = 0;
     foreach (DataRow domainRow in domainsTbl.Rows)
     {
         string domainName = (string)domainRow["domain"];
         // maxRev is the revision of the most recent document that shares the row's urlKey.
         DataTable urlInfoTbl = dbConnection.ExecuteQuery(string.Format(@"
             SELECT TOP {0} d.id, d.corpusId, d.time, d.responseUrl, d.urlKey, d.rev, d.domain, (SELECT TOP 1 dd.rev from Documents dd WHERE dd.urlKey = d.urlKey ORDER BY dd.time DESC, dd.rev DESC) AS maxRev, tb.hashCodes FROM Documents d
             INNER JOIN TextBlocks tb ON d.corpusId = tb.corpusId AND d.id = tb.docId WHERE d.domain = ? ORDER BY d.time DESC", mMaxQueueSize), domainName);
         if (urlInfoTbl.Rows.Count == 0) { continue; }
         Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
         // The query sorts by time descending, so row 0 is the newest document; the history window ends there.
         DateTime then = DateTime.Parse((string)urlInfoTbl.Rows[0]["time"]) - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
         domainCount++;
         Console.WriteLine("* " + domainName + string.Format(" ({0}/{1})", domainCount, domainsTbl.Rows.Count));
         Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
         // Walk the rows backwards so that entries are enqueued oldest first.
         for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
         {
             DataRow row = urlInfoTbl.Rows[j];
             DateTime time = DateTime.Parse((string)row["time"]);
             if (time < then) { continue; } // outside the history window; nothing else from this row is needed
             int rev = (int)row["rev"];
             int maxRev = (int)row["maxRev"];
             string urlKey = (string)row["urlKey"];
             // URL cache
             if (rev == 1)
             {
                 // Dictionary.Add throws on a duplicate key, so an already cached key is removed first.
                 if (urlInfo.First.ContainsKey(urlKey)) { Remove(urlKey, urlInfo); }
                 urlInfo.First.Add(urlKey, new Ref<int>(maxRev));
                 urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
             }
             else
             {
                 urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/null, time)); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
             }
             // URL tree
             string hashCodesBase64 = (string)row["hashCodes"];
             string responseUrl = (string)row["responseUrl"];
             byte[] buffer = Convert.FromBase64String(hashCodesBase64);
             BinarySerializer memSer = new BinarySerializer(new MemoryStream(buffer));
             ArrayList<ulong> hashCodes = new ArrayList<ulong>(memSer);
             bool fullPath = urlKey.Contains("?");
             // Only the first revision of a document changes the document count; the entry records the same flag.
             TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/rev == 1);
             textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/true, /*incDocCount=*/rev == 1);
             textBlockInfo.Second.Enqueue(entry);
         }
     }
     logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
 }
コード例 #7
0
        /// <summary>
        /// Rebuilds the in-memory URL cache and URL tree history from a SQL Server database.
        /// </summary>
        /// <remarks>
        /// Clears <c>mUrlInfo</c> and <c>mTextBlockInfo</c>, then selects the union of the domains with the most
        /// recent documents and the domains with the most documents (a fixed number of each). For every selected
        /// domain it loads up to <c>mMaxQueueSize</c> document/text-block rows (newest first) and replays, oldest
        /// first, those rows that are at most <c>mHistoryAgeDays</c> days older than the newest row.
        /// </remarks>
        /// <param name="dbConnectionString">Connection string used to open the <see cref="SqlConnection"/>.</param>
        public static void InitializeHistory(string dbConnectionString)
        {
            Logger logger = Logger.GetLogger(typeof(UrlTreeBoilerplateRemoverComponent));

            logger.Info("InitializeHistory", "Loading history ...");
            mUrlInfo.Clear();
            mTextBlockInfo.Clear();
            int domainCount = 0;

            using (SqlConnection dbConnection = new SqlConnection(dbConnectionString))
            {
                dbConnection.Open();
                DataTable domainsTbl;
                using (SqlCommand sqlCmd = new SqlCommand(string.Format(@"
                    SELECT DISTINCT domainName FROM (
                        SELECT * FROM (SELECT TOP {0} domainName FROM Documents WHERE domainName IS NOT NULL GROUP BY domainName ORDER BY MAX(time) DESC) x 
                        UNION 
                        SELECT * FROM (SELECT TOP {0} domainName FROM Documents WHERE domainName IS NOT NULL GROUP BY domainName ORDER BY COUNT(*) DESC) y
                    ) xy", 3000 /*make this configurable*/), dbConnection))
                {
                    domainsTbl = new DataTable();
                    using (SqlDataReader sqlReader = sqlCmd.ExecuteReader())
                    {
                        domainsTbl.Load(sqlReader);
                    }
                }
                foreach (DataRow domainRow in domainsTbl.Rows)
                {
                    string    domainName = (string)domainRow["domainName"];
                    DataTable urlInfoTbl;
                    // Format the TOP clause once; a second string.Format pass over the finished SQL is unnecessary
                    // and would throw if the statement ever contained braces.
                    // maxRev is the revision of the most recent document that shares the row's urlKey.
                    using (SqlCommand sqlCmd = new SqlCommand(string.Format(@"
                        SELECT TOP {0} d.guid, d.time, d.responseUrl, d.urlKey, d.rev, d.domainName, (SELECT TOP 1 dd.rev from Documents dd WHERE dd.urlKey = d.urlKey ORDER BY dd.time DESC, dd.rev DESC) AS maxRev, tb.hashCodes FROM Documents d 
                        INNER JOIN TextBlocks tb ON d.guid = tb.docGuid WHERE d.domainName = @domainName ORDER BY d.time DESC
                        ", mMaxQueueSize), dbConnection))
                    {
                        sqlCmd.AssignParams("domainName", domainName);
                        urlInfoTbl = new DataTable();
                        using (SqlDataReader sqlReader = sqlCmd.ExecuteReader())
                        {
                            urlInfoTbl.Load(sqlReader);
                        }
                    }
                    if (urlInfoTbl.Rows.Count == 0)
                    {
                        continue;
                    }
                    Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo = GetTextBlockInfo(domainName);
                    // The query sorts by time descending, so row 0 is the newest document; the history window ends there.
                    DateTime then = (DateTime)urlInfoTbl.Rows[0]["time"] - new TimeSpan(mHistoryAgeDays, 0, 0, 0);
                    domainCount++;
                    Console.WriteLine("* " + domainName + string.Format(" ({0}/{1})", domainCount, domainsTbl.Rows.Count));
                    Pair <Dictionary <string, Ref <int> >, Queue <UrlHistoryEntry> > urlInfo = GetUrlInfo(domainName);
                    // Walk the rows backwards so that entries are enqueued oldest first.
                    for (int j = urlInfoTbl.Rows.Count - 1; j >= 0; j--)
                    {
                        DataRow  row  = urlInfoTbl.Rows[j];
                        DateTime time = (DateTime)row["time"];
                        if (time < then)
                        {
                            continue; // outside the history window; nothing else from this row is needed
                        }
                        int    rev    = (int)row["rev"];
                        int    maxRev = (int)row["maxRev"];
                        string urlKey = (string)row["urlKey"];
                        // URL cache
                        if (rev == 1)
                        {
                            // Dictionary.Add throws on a duplicate key, so an already cached key is removed first.
                            if (urlInfo.First.ContainsKey(urlKey))
                            {
                                Remove(urlKey, urlInfo);
                            }
                            urlInfo.First.Add(urlKey, new Ref <int>(maxRev));
                            urlInfo.Second.Enqueue(new UrlHistoryEntry(urlKey, time));
                        }
                        else
                        {
                            urlInfo.Second.Enqueue(new UrlHistoryEntry(/*urlKey=*/ null, time)); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                        }
                        // URL tree (the hashCodes column is read as raw bytes, not as a Base64 string)
                        string            responseUrl = (string)row["responseUrl"];
                        byte[]            buffer      = (byte[])row["hashCodes"];
                        BinarySerializer  memSer      = new BinarySerializer(new MemoryStream(buffer));
                        ArrayList <ulong> hashCodes   = new ArrayList <ulong>(memSer);
                        bool fullPath = urlKey.Contains("?");
                        // Only the first revision of a document changes the document count; the entry records the same flag.
                        TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/ rev == 1);
                        textBlockInfo.First.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/ true, /*incDocCount=*/ rev == 1);
                        textBlockInfo.Second.Enqueue(entry);
                    }
                }
            }
            logger.Info("InitializeHistory", "Loaded history for {0} distinct domains.", domainCount);
        }