Example #1
        //Crawl Instance
        private static async Task StartCrawlSitesAsync()
        {
            string crawledUrl          = null;
            var    siteToCrawlProvider = new SiteToCrawlProvider();
            var    fileStream          = new FileStream("Sites.txt", FileMode.Open, FileAccess.Read); //open Sites.txt to read the websites to be crawled

            using (var streamReader = new StreamReader(fileStream, Encoding.UTF8))
            {
                string line;
                while ((line = streamReader.ReadLine()) != null)
                {
                    siteToCrawlProvider.AddSitesToCrawl(new List <SiteToCrawl>
                    {
                        new SiteToCrawl {
                            Uri = new Uri(line)
                        }
                    });                      //queue the site with the provider

                    Console.WriteLine(line); //display the sites as they are read
                }


                //Create the crawl engine instance
                var crawlEngine = new ParallelCrawlerEngine(new ParallelImplementationContainer
                {
                    SiteToCrawlProvider = siteToCrawlProvider
                });

                //Register for site level events
                crawlEngine.AllCrawlsCompleted += (sender, eventArgs) =>
                {
                    Console.WriteLine("Completed crawling all sites");
                };
                crawlEngine.SiteCrawlCompleted += (sender, eventArgs) =>
                {
                    Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);

                    WebClient webClient = new WebClient();
                    webClient.DownloadFileCompleted   += new AsyncCompletedEventHandler(Completed);
                    webClient.DownloadProgressChanged += new DownloadProgressChangedEventHandler(ProgressChanged);
                    // Create the output folder (and any missing parent folders) if it does not already exist.
                    string pathString = @"c:\files\CryptoCrawler\";
                    System.IO.Directory.CreateDirectory(pathString);

                    // Base name for the downloaded page. A random name via
                    // System.IO.Path.GetRandomFileName() would also work here.
                    string fileName = "NewFile.htm";

                    // Verify the path that has been constructed.
                    Console.WriteLine("Path to my file: {0}\n", pathString);

                    // Download the crawled site's landing page. NextAvailableFilename picks a
                    // file name that does not exist yet, so earlier downloads are not overwritten.
                    webClient.DownloadFileAsync(eventArgs.CrawledSite.SiteToCrawl.Uri,
                                                NextAvailableFilename(pathString + fileName));

                    // Record the crawled URL so it can be serialized and reused after the crawl.
                    crawledUrl = eventArgs.CrawledSite.SiteToCrawl.Uri.ToString();
                    var eCrawlResult = new List <UrlInfo>
                    {
                        new UrlInfo(crawledUrl, eventArgs.CrawledSite.SiteToCrawl.Id.ToString(), DateTime.Now.ToString())
                    };
                    urls = crawledUrl; //'urls' is a field declared elsewhere in the class

                    // Serialize the crawled addresses to an XML file. Note that the StreamWriter is
                    // opened in append mode, so every completed site appends another root element
                    // and the file is not a single well-formed XML document.
                    var serial = new XmlSerializer(typeof(List <UrlInfo>));
                    using (var writer = new System.IO.StreamWriter(@"C:\files\url.xml", true))
                    {
                        serial.Serialize(writer, eCrawlResult);
                    }


                };
                crawlEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
                {
                    //Register for crawler level events. These are Abot's events!!!
                    eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
                    {
                        Console.WriteLine("You have the crawled page here in abotEventArgs.CrawledPage...");
                    };
                };

                await crawlEngine.StartAsync();

                Test(crawledUrl);
                Console.WriteLine("Press Enter to stop");
                Console.Read();

                //crawlEngine.Stop();
            }
        }
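
Example #1 calls several members that are defined elsewhere in its class and therefore do not appear above: the UrlInfo type that gets serialized, the NextAvailableFilename helper, the Completed/ProgressChanged WebClient handlers, plus the urls field and the Test method. The following is only a minimal sketch of what some of those members might look like; the property names and behavior are assumptions, not taken from the example.

        //Hypothetical supporting members for Example #1 (assumed, not part of the original code).
        //Requires: using System; using System.ComponentModel; using System.Net; using System.Collections.Generic;
        public class UrlInfo
        {
            public string Url { get; set; }
            public string SiteId { get; set; }
            public string CrawledAt { get; set; }

            public UrlInfo() { } //parameterless constructor required by XmlSerializer

            public UrlInfo(string url, string siteId, string crawledAt)
            {
                Url       = url;
                SiteId    = siteId;
                CrawledAt = crawledAt;
            }
        }

        //Returns the given path if no file exists there yet, otherwise appends (1), (2), ...
        //until an unused name is found, so repeated downloads do not overwrite each other.
        private static string NextAvailableFilename(string path)
        {
            if (!System.IO.File.Exists(path))
            {
                return path;
            }

            string dir  = System.IO.Path.GetDirectoryName(path);
            string name = System.IO.Path.GetFileNameWithoutExtension(path);
            string ext  = System.IO.Path.GetExtension(path);

            for (int i = 1; ; i++)
            {
                string candidate = System.IO.Path.Combine(dir, string.Format("{0} ({1}){2}", name, i, ext));
                if (!System.IO.File.Exists(candidate))
                {
                    return candidate;
                }
            }
        }

        private static void Completed(object sender, AsyncCompletedEventArgs e)
        {
            Console.WriteLine("Download completed.");
        }

        private static void ProgressChanged(object sender, DownloadProgressChangedEventArgs e)
        {
            Console.WriteLine("Downloaded {0}%", e.ProgressPercentage);
        }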
Example #2
        public void Run()
        {
            var resources = SqlHelper.UnloadResources();

            if (!resources.Any())
            {
                return;
            }

            var siteToCrawls = new List <SiteToCrawl>();

            foreach (var res in resources)
            {
                for (var i = 0; i < 10; i++)
                {
                    siteToCrawls.Add(new SiteToCrawl
                    {
                        Uri     = new Uri(string.Format(_urlPattern, res, 10 * i)),
                        SiteBag = new { Name = res, Number = i + 1 }
                    });
                }
            }

            CrawlConfigurationX config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();

            XmlConfigurator.Configure(); //so the log4net logger picks up its configuration from the config file

            var siteToCrawlProvider = new SiteToCrawlProvider();

            siteToCrawlProvider.AddSitesToCrawl(siteToCrawls);

            //Create the crawl engine instance
            var impls = new ParallelImplementationOverride(
                config,
                new ParallelImplementationContainer
            {
                SiteToCrawlProvider = siteToCrawlProvider
            }
                );

            _crawlerEngine = new ParallelCrawlerEngine(config, impls);

            //Register for site level events
            _crawlerEngine.AllCrawlsCompleted += (sender, eventArgs) =>
            {
                Console.WriteLine("Completed crawling all sites");
                _crawlerEngine.Stop(true);
                Run();
            };
            _crawlerEngine.SiteCrawlCompleted += (sender, eventArgs) =>
            {
                Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);
            };
            _crawlerEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
            {
                eventArgs.Crawler.CrawlBag = eventArgs.SiteToCrawl.SiteBag;
                //Register for crawler level events. These are Abot's events!!!
                eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
                {
                    var         crawlX      = abotSender as CrawlerX;
                    CrawledPage crawledPage = abotEventArgs.CrawledPage;

                    if (crawledPage.WebException != null || crawledPage.HttpWebResponse?.StatusCode != HttpStatusCode.OK)
                    {
                        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                    }
                    else
                    {
                        if (string.IsNullOrEmpty(crawledPage.Content.Text))
                        {
                            Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                        }
                        else
                        {
                            try
                            {
                                if (crawledPage.CrawlDepth == 1)
                                {
                                    Console.WriteLine("Depth: {0} --- Crawl of page succeeded {1}", crawledPage.CrawlDepth, crawledPage.Uri.AbsoluteUri);
                                    var item = new CrawledItem()
                                    {
                                        Name       = crawlX.CrawlBag.Name,
                                        PageNumber = crawlX.CrawlBag.Number,
                                        Url        = crawledPage.Uri.AbsoluteUri,
                                        Detail     = crawledPage.Content.Text
                                    };

                                    SqlHelper.Store(new System.Collections.Generic.List <CrawledItem>()
                                    {
                                        item
                                    });
                                }
                            }
                            catch (Exception e)
                            {
                                Console.WriteLine(e.Message);
                            }
                        }
                    }

                    //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
                    //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
                };
                eventArgs.Crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
                {
                    CrawlDecision decision = new CrawlDecision {
                        Allow = true
                    };

                    //Only follow Baidu search-result pages at depth 0 and Baidu redirect links at depth 1
                    var isUnwantedAtDepth0 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
                    var isUnwantedAtDepth1 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

                    if (isUnwantedAtDepth0 || isUnwantedAtDepth1)
                    {
                        return new CrawlDecision {
                            Allow = false, Reason = "Don't want to crawl pages outside the Baidu search results"
                        };
                    }

                    return decision;
                });
            };

            _crawlerEngine.StartAsync(); //fire-and-forget: Run() is synchronous, so the returned Task is not awaited
        }
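
Example #2 likewise depends on pieces that are not shown: the _urlPattern and _crawlerEngine fields, the CrawledItem type it stores, and the SqlHelper data-access class. Below is a minimal sketch under those assumptions; the URL pattern, member names, and the console-only Store are guesses, not taken from the example.

        //Hypothetical supporting members for Example #2 (assumed, not part of the original code).
        //Requires: using System; using System.Collections.Generic;
        //The Baidu URL pattern below is a guess based on the ShouldCrawlPage filters above:
        //private readonly string _urlPattern = "https://www.baidu.com/s?wd={0}&pn={1}";
        //private ParallelCrawlerEngine _crawlerEngine;

        public class CrawledItem
        {
            public string Name       { get; set; }
            public int    PageNumber { get; set; }
            public string Url        { get; set; }
            public string Detail     { get; set; }
        }

        public static class SqlHelper
        {
            //Returns the search terms (resources) that still need to be crawled.
            //A real implementation would read these from the database.
            public static List<string> UnloadResources()
            {
                return new List<string> { "bitcoin", "ethereum" };
            }

            //Persists the crawled pages; a real implementation would write them to SQL Server.
            public static void Store(List<CrawledItem> items)
            {
                foreach (var item in items)
                {
                    Console.WriteLine("Storing {0} (page {1}): {2}", item.Name, item.PageNumber, item.Url);
                }
            }
        }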