Example #1
0
 /// <summary>
 /// Diagnostic backend: echoes the crawl's URI, content size and timestamp
 /// to the console, then returns the crawl unchanged for chaining.
 /// </summary>
 /// <param name="crawl">The crawl to log.</param>
 /// <returns>The same <paramref name="crawl"/> instance.</returns>
 public override PageCrawl InsertPageCrawl(PageCrawl crawl)
 {
     Console.WriteLine("Logger database crawl insert:");
     Console.WriteLine(crawl.Page.Uri.ToString());
     Console.WriteLine($"{crawl.Content.Length} bytes");
     Console.WriteLine(crawl.CrawlTime);
     Console.WriteLine();
     return crawl;
 }
Example #2
0
        /// <summary>
        /// Persists a crawl to disk as a text file (URI, referrer, crawl time and
        /// content joined by DATA_SEPARATOR) under the path derived from the page's
        /// host and URL, then returns the crawl unchanged for chaining.
        /// </summary>
        /// <param name="crawl">The crawl to save.</param>
        /// <returns>The same <paramref name="crawl"/> instance.</returns>
        /// <exception cref="IOException">Wraps any failure while creating or writing the file.</exception>
        public PageCrawl InsertPageCrawl(PageCrawl crawl)
        {
            Console.WriteLine("FILE DB");
            string crawlPath = GetPageCrawlPath(crawl);

            // Strip characters that are invalid in file names or paths, but keep
            // slashes so the URL's directory structure is preserved on disk.
            foreach (char invalidChar in Path.GetInvalidFileNameChars().Concat(Path.GetInvalidPathChars()))
            {
                if (invalidChar == '/' || invalidChar == '\\')
                {
                    continue;                                            // leaves slashes to preserve file path
                }
                crawlPath = crawlPath.Replace(invalidChar, '_');
            }

            // IsNullOrWhiteSpace already returns true for null and empty strings,
            // so a separate IsNullOrEmpty check is redundant.
            if (String.IsNullOrWhiteSpace(crawlPath))
            {
                crawlPath = "%20";
            }

            // creating an existing directory has no effect so we can do that every time
            try
            {
                Directory.CreateDirectory(Path.GetDirectoryName(crawlPath));

                // using guarantees the writer is flushed and closed even when a
                // Write throws mid-way (the previous Close()-only code leaked the
                // stream on exceptions).
                using (StreamWriter streamWriter = File.CreateText(crawlPath))
                {
                    streamWriter.Write(crawl.Page.Uri.ToString());
                    streamWriter.Write(DATA_SEPARATOR);
                    streamWriter.Write(crawl.Page.LinkedFrom);
                    streamWriter.Write(DATA_SEPARATOR);
                    streamWriter.Write(crawl.CrawlTime);
                    streamWriter.Write(DATA_SEPARATOR);
                    streamWriter.Write(crawl.Content);
                }
            }
            catch (Exception ex)
            {
                string errMsg = "Failed to save crawl to disk - " + crawl.Page.Uri.ToString() + " at " + crawlPath;
                Console.WriteLine(errMsg);
                throw new IOException(errMsg, ex);
            }

            Console.WriteLine("Saved crawl to " + crawlPath);

            return crawl;
        }
Example #3
0
        /// <summary>
        /// Runs the crawl loop as a lazy iterator: dequeues pages from the frontier,
        /// crawls each one, enqueues newly discovered links, and yields every
        /// successful crawl. Terminates when the frontier is empty.
        /// </summary>
        /// <returns>A lazily-produced sequence of completed page crawls.</returns>
        public IEnumerable <PageCrawl> Start()
        {
            while (frontier.Count > 0)
            {
                Console.WriteLine("Frontier size: " + frontier.Count);

                Page currentCrawlPage = frontier.Dequeue();

                // Filtering policy lives entirely in ShouldSkip; its semantics
                // (robots rules? already-visited?) are not visible in this block.
                if (ShouldSkip(currentCrawlPage.Uri))
                {
                    continue;
                }

                latestCrawl = Browser.Crawl(currentCrawlPage);

                currentCrawlPage.Crawl = latestCrawl;
                // Marked visited even when the crawl fails (null result below),
                // so a failed page is never retried.
                visited.Add(currentCrawlPage.ToString());
                // Ensure crawl didn't fail
                if (latestCrawl == null)
                {
                    continue;
                }

                Uri[] newLinks = LinkParser.Parse(latestCrawl.Content, latestCrawl.Page.Uri);

                // Add links and new crawl

                foreach (Uri uri in newLinks)
                {
                    // Dedup is only against visited; the frontier itself can still
                    // accumulate duplicate URIs between visits. NOTE(review):
                    // presumably ShouldSkip catches those on dequeue — confirm.
                    if (!visited.Contains(uri.ToString()))
                    {
                        frontier.Enqueue(new Page(uri, currentCrawlPage));
                    }
                }

                yield return(latestCrawl);

                // Politeness delay: host's CrawlDelay (seconds -> ms) minus the time
                // already elapsed since the crawl's timestamp, clamped at zero.
                // NOTE(review): robots[...] throws KeyNotFoundException if the host
                // was never registered — confirm every dequeued host has an entry.
                int millisecondDelay = robots[currentCrawlPage.Uri.Authority].CrawlDelay * 1000;
                millisecondDelay = (int)Math.Max(0, millisecondDelay - (DateTime.Now - latestCrawl.CrawlTime).TotalMilliseconds);
                Console.WriteLine("Waiting for " + millisecondDelay + "ms");
                Thread.Sleep(millisecondDelay);
            }
        }
Example #4
0
 /// <summary>
 /// Base hook with no persistence of its own; storage backends override this
 /// to store the crawl.
 /// </summary>
 /// <param name="crawl">The crawl a derived class would store.</param>
 /// <exception cref="NotImplementedException">Always thrown by this base implementation.</exception>
 public virtual PageCrawl InsertPageCrawl(PageCrawl crawl)
     => throw new NotImplementedException();
Example #5
0
 /// <summary>
 /// Builds the on-disk file path for a crawl: the host directory, the URI's
 /// absolute path, a "-" separator, the crawl timestamp, and a ".txt" extension.
 /// </summary>
 /// <param name="crawl">The crawl whose storage path is wanted.</param>
 /// <returns>The full file path for this crawl's text file.</returns>
 protected string GetPageCrawlPath(PageCrawl crawl)
 {
     string timestamp = crawl.CrawlTime.ToString("yyyy'-'MM'-'dd-HH'-'mm'-'ss");
     return $"{GetHostPath(crawl)}{crawl.Page.Uri.AbsolutePath}-{timestamp}.txt";
 }
Example #6
0
 /// <summary>
 /// Returns the directory path for a crawl's host: the database root directory
 /// followed by "/" and the page URI's host name.
 /// </summary>
 /// <param name="crawl">The crawl whose host directory is wanted.</param>
 /// <returns>The host-level directory path under the database root.</returns>
 protected string GetHostPath(PageCrawl crawl)
 {
     return $"{GetDatabaseDir()}/{crawl.Page.Uri.Host}";
 }