// These using directives belong at the top of the file. The Abot/AbotX (and
// log4net) directives depend on the installed package versions and are
// assumed to already be in place for the crawler types used below.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Xml.Serialization;

//Crawl Instance
private static async Task StartCrawlSitesAsync()
{
    string crawledUrl = null;
    var crawlResults = new List<UrlInfo>();
    var siteToCrawlProvider = new SiteToCrawlProvider();

    // Open Sites.txt and queue every listed website for crawling.
    using (var streamReader = new StreamReader("Sites.txt", Encoding.UTF8))
    {
        string line;
        while ((line = streamReader.ReadLine()) != null)
        {
            siteToCrawlProvider.AddSitesToCrawl(new List<SiteToCrawl>
            {
                new SiteToCrawl { Uri = new Uri(line) }
            });
            Console.WriteLine(line); // display each site as it is queued
        }
    }

    // Create the crawl engine instance.
    var crawlEngine = new ParallelCrawlerEngine(new ParallelImplementationContainer
    {
        SiteToCrawlProvider = siteToCrawlProvider
    });

    // Register for site level events.
    crawlEngine.AllCrawlsCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling all sites");

        // Serialize every crawled address to one XML file. (The original
        // appended per site, which produces invalid XML with multiple root
        // elements; writing the whole list once avoids that.)
        var serial = new XmlSerializer(typeof(List<UrlInfo>));
        using (var writer = new StreamWriter(@"C:\files\url.xml", false))
        {
            serial.Serialize(writer, crawlResults);
        }
    };
    crawlEngine.SiteCrawlCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);

        // Save a copy of the crawled site's page to disk.
        var webClient = new WebClient();
        webClient.DownloadFileCompleted += Completed;
        webClient.DownloadProgressChanged += ProgressChanged;

        // Create the output folder; CreateDirectory is a no-op if it already exists.
        string pathString = @"c:\files\CryptoCrawler\";
        Directory.CreateDirectory(pathString);

        string fileName = "NewFile.htm";
        Console.WriteLine("Path to my file: {0}\n", pathString);

        // NextAvailableFilename picks an unused name so an existing file is
        // never overwritten, even when several sites finish in a row.
        webClient.DownloadFileAsync(eventArgs.CrawledSite.SiteToCrawl.Uri,
            NextAvailableFilename(pathString + fileName));

        crawledUrl = eventArgs.CrawledSite.SiteToCrawl.Uri.ToString();

        // Record the crawled address; the lock matters because the parallel
        // engine can raise SiteCrawlCompleted from multiple threads.
        lock (crawlResults)
        {
            crawlResults.Add(new UrlInfo(
                eventArgs.CrawledSite.SiteToCrawl.Uri.ToString(),
                eventArgs.CrawledSite.SiteToCrawl.Id.ToString(),
                DateTime.Now.ToString()));
        }
    };
    crawlEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
    {
        // Register for crawler level events. These are Abot's events!!!
        eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
        {
            Console.WriteLine("You have the crawled page here in abotEventArgs.CrawledPage...");
        };
    };

    await crawlEngine.StartAsync();
    Test(crawledUrl); // post-crawl step defined elsewhere

    Console.WriteLine("Press enter key to stop");
    Console.Read();
    //crawlEngine.Stop();
}
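// The method above calls Completed, ProgressChanged, NextAvailableFilename,
// UrlInfo and Test without showing them. A minimal sketch of what they could
// look like follows; the WebClient handler signatures are fixed by the BCL,
// but UrlInfo, NextAvailableFilename and Test are assumptions about code
// defined elsewhere, inferred from how they are called.

// Reports the outcome of an async WebClient download.
private static void Completed(object sender, AsyncCompletedEventArgs e)
{
    Console.WriteLine(e.Error == null ? "Download completed." : "Download failed: " + e.Error.Message);
}

// Reports download progress as it arrives.
private static void ProgressChanged(object sender, DownloadProgressChangedEventArgs e)
{
    Console.WriteLine("{0}% downloaded", e.ProgressPercentage);
}

// Returns the given path if it is free, otherwise appends a counter
// ("NewFile.htm" -> "NewFile (2).htm") until an unused name is found.
private static string NextAvailableFilename(string path)
{
    if (!File.Exists(path))
        return path;

    string dir = Path.GetDirectoryName(path);
    string name = Path.GetFileNameWithoutExtension(path);
    string ext = Path.GetExtension(path);

    for (int i = 2; ; i++)
    {
        string candidate = Path.Combine(dir, string.Format("{0} ({1}){2}", name, i, ext));
        if (!File.Exists(candidate))
            return candidate;
    }
}

// Placeholder for the post-crawl step; the real body is not shown in this section.
private static void Test(string url)
{
    Console.WriteLine("Post-crawl processing for {0}", url);
}

// One serialized record per crawled site. XmlSerializer needs a public
// parameterless constructor alongside the three-argument one used above.
public class UrlInfo
{
    public string Url { get; set; }
    public string Id { get; set; }
    public string Date { get; set; }

    public UrlInfo() { }

    public UrlInfo(string url, string id, string date)
    {
        Url = url;
        Id = id;
        Date = date;
    }
}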
public void Run()
{
    var resources = SqlHelper.UnloadResources();
    if (!resources.Any())
    {
        return;
    }

    // Build ten paged search URLs per resource; SiteBag carries the search
    // term and page number through to the crawler instance.
    var siteToCrawls = new List<SiteToCrawl>();
    foreach (var res in resources)
    {
        for (var i = 0; i < 10; i++)
        {
            siteToCrawls.Add(new SiteToCrawl
            {
                Uri = new Uri(string.Format(_urlPattern, res, 10 * i)),
                SiteBag = new { Name = res, Number = i + 1 }
            });
        }
    }

    CrawlConfigurationX config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();
    XmlConfigurator.Configure(); // so the logger is configured

    var siteToCrawlProvider = new SiteToCrawlProvider();
    siteToCrawlProvider.AddSitesToCrawl(siteToCrawls);

    // Create the crawl engine instance.
    var impls = new ParallelImplementationOverride(
        config,
        new ParallelImplementationContainer
        {
            SiteToCrawlProvider = siteToCrawlProvider
        }
    );
    _crawlerEngine = new ParallelCrawlerEngine(config, impls);

    // Register for site level events.
    _crawlerEngine.AllCrawlsCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling all sites");
        _crawlerEngine.Stop(true);
        Run(); // start over with whatever resources remain
    };
    _crawlerEngine.SiteCrawlCompleted += (sender, eventArgs) =>
    {
        Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);
    };
    _crawlerEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
    {
        eventArgs.Crawler.CrawlBag = eventArgs.SiteToCrawl.SiteBag;

        // Register for crawler level events. These are Abot's events!!!
        eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
        {
            var crawlX = abotSender as CrawlerX;
            CrawledPage crawledPage = abotEventArgs.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
            }
            else if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
            }
            else
            {
                try
                {
                    // Only pages one level below the search page are stored.
                    if (crawledPage.CrawlDepth == 1)
                    {
                        Console.WriteLine("Depth: {0} --- Crawl of page succeeded {1}",
                            crawledPage.CrawlDepth, crawledPage.Uri.AbsoluteUri);
                        var item = new CrawledItem
                        {
                            Name = crawlX.CrawlBag.Name,
                            PageNumber = crawlX.CrawlBag.Number,
                            Url = crawledPage.Uri.AbsoluteUri,
                            Detail = crawledPage.Content.Text
                        };
                        SqlHelper.Store(new List<CrawledItem> { item });
                    }
                }
                catch (Exception e)
                {
                    Console.WriteLine(e.Message);
                }
            }
            //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
            //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
        };

        eventArgs.Crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
        {
            // Allow only Baidu search-result pages at depth 0 and Baidu
            // redirect links at depth 1; skip everything else.
            var blockAtDepth0 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
            var blockAtDepth1 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");
            if (blockAtDepth0 || blockAtDepth1)
            {
                return new CrawlDecision { Allow = false, Reason = "Don't want to crawl pages outside the Baidu results" };
            }
            return new CrawlDecision { Allow = true };
        });
    };
    _crawlerEngine.StartAsync(); // fire-and-forget; completion is signalled via AllCrawlsCompleted
}
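// Run() references several members that live elsewhere in the class. The
// sketch below shows plausible minimal shapes for them: _urlPattern is an
// assumption inferred from the Baidu checks in ShouldCrawlPage, and SqlHelper
// is stubbed in memory here rather than talking to a real database.

// Assumed Baidu search URL: {0} = search term, {1} = result offset ("pn").
private readonly string _urlPattern = "https://www.baidu.com/s?wd={0}&pn={1}";
private ParallelCrawlerEngine _crawlerEngine;

// One row per stored page.
public class CrawledItem
{
    public string Name { get; set; }
    public int PageNumber { get; set; }
    public string Url { get; set; }
    public string Detail { get; set; }
}

// In-memory stand-in for the database helper; a real implementation would
// read pending search terms from SQL and persist crawled pages back to it.
public static class SqlHelper
{
    private static readonly Queue<string> resources = new Queue<string>(new[] { "bitcoin", "ethereum" });
    private static readonly List<CrawledItem> stored = new List<CrawledItem>();

    // Hands out the remaining search terms; empty once the queue is drained,
    // which is what makes the recursive Run() call terminate.
    public static List<string> UnloadResources()
    {
        var batch = new List<string>();
        while (resources.Count > 0)
            batch.Add(resources.Dequeue());
        return batch;
    }

    // Records crawled pages; the lock guards against Abot's parallel callbacks.
    public static void Store(List<CrawledItem> items)
    {
        lock (stored)
            stored.AddRange(items);
    }
}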