public override PageCrawl InsertPageCrawl(PageCrawl crawl)
{
    Console.WriteLine("Logger database crawl insert:");
    Console.WriteLine(crawl.Page.Uri.ToString());
    Console.WriteLine(crawl.Content.Length + " bytes");
    Console.WriteLine(crawl.CrawlTime);
    Console.WriteLine();
    return crawl;
}
public PageCrawl InsertPageCrawl(PageCrawl crawl)
{
    Console.WriteLine("FILE DB");
    string crawlPath = GetPageCrawlPath(crawl);

    // Replace invalid path characters, but keep slashes so the
    // directory structure of the path is preserved.
    foreach (char invalidChar in Path.GetInvalidFileNameChars().Concat(Path.GetInvalidPathChars()))
    {
        if (invalidChar == '/' || invalidChar == '\\')
        {
            continue;
        }
        crawlPath = crawlPath.Replace(invalidChar, '_');
    }

    // IsNullOrWhiteSpace already covers the null and empty cases.
    if (String.IsNullOrWhiteSpace(crawlPath))
    {
        crawlPath = "%20";
    }

    try
    {
        // Creating an existing directory is a no-op, so it is safe to do every time.
        // Guard against an empty directory component (e.g. the "%20" fallback),
        // which would make CreateDirectory throw.
        string crawlDir = Path.GetDirectoryName(crawlPath);
        if (!String.IsNullOrEmpty(crawlDir))
        {
            Directory.CreateDirectory(crawlDir);
        }

        // "using" disposes the writer even if one of the writes throws.
        using (StreamWriter streamWriter = File.CreateText(crawlPath))
        {
            streamWriter.Write(crawl.Page.Uri.ToString());
            streamWriter.Write(DATA_SEPARATOR);
            streamWriter.Write(crawl.Page.LinkedFrom);
            streamWriter.Write(DATA_SEPARATOR);
            streamWriter.Write(crawl.CrawlTime);
            streamWriter.Write(DATA_SEPARATOR);
            streamWriter.Write(crawl.Content);
        }
    }
    catch (Exception ex)
    {
        string errMsg = "Failed to save crawl to disk - " + crawl.Page.Uri.ToString() + " at " + crawlPath;
        Console.WriteLine(errMsg);
        throw new IOException(errMsg, ex);
    }

    Console.WriteLine("Saved crawl to " + crawlPath);
    return crawl;
}
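// A minimal read-back sketch, assuming the four-field record layout written
// above (URI, LinkedFrom, CrawlTime, Content joined by DATA_SEPARATOR, which
// is assumed to be a string constant). ReadPageCrawlFile is a hypothetical
// helper, not part of the original source, and it assumes the separator
// never occurs in the first three fields.
protected string[] ReadPageCrawlFile(string crawlPath)
{
    string raw = File.ReadAllText(crawlPath);
    // Split into at most 4 parts so a separator appearing inside the page
    // content does not break the content field apart.
    return raw.Split(new[] { DATA_SEPARATOR }, 4, StringSplitOptions.None);
}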
public IEnumerable<PageCrawl> Start()
{
    while (frontier.Count > 0)
    {
        Console.WriteLine("Frontier size: " + frontier.Count);
        Page currentCrawlPage = frontier.Dequeue();
        if (ShouldSkip(currentCrawlPage.Uri))
        {
            continue;
        }

        latestCrawl = Browser.Crawl(currentCrawlPage);
        currentCrawlPage.Crawl = latestCrawl;
        // Mark the page as visited even if the crawl failed, so it is not retried.
        visited.Add(currentCrawlPage.ToString());

        // Skip link extraction if the crawl failed.
        if (latestCrawl == null)
        {
            continue;
        }

        Uri[] newLinks = LinkParser.Parse(latestCrawl.Content, latestCrawl.Page.Uri);

        // Enqueue any links we have not visited yet.
        foreach (Uri uri in newLinks)
        {
            if (!visited.Contains(uri.ToString()))
            {
                frontier.Enqueue(new Page(uri, currentCrawlPage));
            }
        }

        yield return latestCrawl;

        // Honour the host's robots.txt crawl delay, minus the time already
        // elapsed since the crawl (e.g. while the caller consumed the
        // yielded result).
        int millisecondDelay = robots[currentCrawlPage.Uri.Authority].CrawlDelay * 1000;
        millisecondDelay = (int)Math.Max(0, millisecondDelay - (DateTime.Now - latestCrawl.CrawlTime).TotalMilliseconds);
        Console.WriteLine("Waiting for " + millisecondDelay + "ms");
        Thread.Sleep(millisecondDelay);
    }
}
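// A minimal usage sketch, not from the original source: because Start() is a
// lazy iterator (yield return), each page can be persisted as soon as it is
// crawled rather than buffering the whole crawl in memory. The Crawler and
// Database type names and the RunCrawl helper are assumptions for
// illustration; only InsertPageCrawl appears in the code above.
public static void RunCrawl(Crawler crawler, Database db)
{
    foreach (PageCrawl crawl in crawler.Start())
    {
        // Streaming consumption: nothing is held beyond the current crawl.
        db.InsertPageCrawl(crawl);
    }
}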
public virtual PageCrawl InsertPageCrawl(PageCrawl crawl)
{
    throw new NotImplementedException();
}
protected string GetPageCrawlPath(PageCrawl crawl)
{
    return GetHostPath(crawl) + crawl.Page.Uri.AbsolutePath + "-"
        + crawl.CrawlTime.ToString("yyyy'-'MM'-'dd-HH'-'mm'-'ss") + ".txt";
}
protected string GetHostPath(PageCrawl crawl)
{
    return GetDatabaseDir() + "/" + crawl.Page.Uri.Host;
}
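// Worked example of the layout these helpers produce, assuming GetDatabaseDir()
// returns "db" (its actual value is not shown here): crawling
// https://example.com/articles/page at 2024-01-15 10:30:00 is saved as
//   db/example.com/articles/page-2024-01-15-10-30-00.txt
// i.e. one directory per host, with the crawl timestamp appended to the
// page's AbsolutePath by GetPageCrawlPath.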