/* ========== Private Members ======== */


        /* ======= Class Constructors ======== */
        // ? public DataScraper() {}
        // ? public DataCrawler(CrawlConfigurationX configX) {}

        #region Public Class Methods
        /* ================================= Class Methods {Public} ============================ */

        /// <summary>
        /// Static method for crawling. Pass in a configuration
        /// (i.e. how many sites to crawl, whether or not to render
        /// JavaScript, etc.); the method then creates and executes the crawler.
        /// </summary>
        public static async Task Crawl(CrawlConfigurationX configX, HttpClientHandler httpHandler, PageHandlerType pageHandlerType, string uriToCrawl = "http://google.com")
        {
            // 'using' scopes the crawlerX object and disposes of it at the end of the
            // block (i.e. the closing brace), following the pattern in the GitHub example.

            ImplementationContainer impContainer = new ImplementationContainer();

            impContainer.PageRequester = new ProxyPageRequester(httpHandler, configX, new WebContentExtractor(), null);

            ImplementationOverride impOverride = new ImplementationOverride(configX, impContainer);

            using (var crawlerX = new CrawlerX(configX, impOverride))
            {
                crawlerX.ShouldRenderPageJavascript((crawledPage, crawlContext) =>
                {
                    if (crawledPage.Uri.AbsoluteUri.Contains("ghost"))
                    {
                        return new CrawlDecision {
                            Allow = false, Reason = "scared to render ghost javascript."
                        };
                    }

                    return new CrawlDecision {
                        Allow = true
                    };
                });

                switch (pageHandlerType)
                {
                case PageHandlerType.wordFrequency:
                    //add handler to be called when the crawl for that page is complete
                    crawlerX.PageCrawlCompleted += WordFrequencyHandler;
                    break;

                case PageHandlerType.sentimentAnalysis:
                    crawlerX.PageCrawlCompleted += SentimentAnalysisHandler;
                    break;
                }

                await crawlerX.CrawlAsync(new Uri(uriToCrawl));
            }
        }
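
        // WordFrequencyHandler and SentimentAnalysisHandler are wired up above but are
        // not shown in this excerpt. Below is a minimal sketch of what such an Abot
        // PageCrawlCompleted handler could look like; the body is illustrative, not the
        // project's actual implementation (assumes System.Linq is in scope).
        private static void WordFrequencyHandler(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            // Skip pages that failed or returned no content.
            if (crawledPage.WebException != null || string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                return;
            }

            // Count word occurrences in the page text.
            var wordCounts = crawledPage.Content.Text
                             .Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
                             .GroupBy(word => word.ToLowerInvariant())
                             .ToDictionary(group => group.Key, group => group.Count());

            Console.WriteLine("Counted {0} distinct words on {1}", wordCounts.Count, crawledPage.Uri.AbsoluteUri);
        }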

        /* ================================= Example #2 ================================= */
        public void Run()
        {
            var resources = SqlHelper.UnloadResources();

            if (!resources.Any())
            {
                return;
            }

            var siteToCrawls = new List <SiteToCrawl>();

            foreach (var res in resources)
            {
                for (var i = 0; i < 10; i++)
                {
                    siteToCrawls.Add(new SiteToCrawl
                    {
                        Uri     = new Uri(string.Format(_urlPattern, res, 10 * i)),
                        SiteBag = new { Name = res, Number = i + 1 }
                    });
                }
            }

            CrawlConfigurationX config = AbotXConfigurationSectionHandler.LoadFromXml().Convert();

            XmlConfigurator.Configure(); // So the logger picks up its XML configuration

            var siteToCrawlProvider = new SiteToCrawlProvider();

            siteToCrawlProvider.AddSitesToCrawl(siteToCrawls);

            //Create the crawl engine instance
            var impls = new ParallelImplementationOverride(
                config,
                new ParallelImplementationContainer
            {
                SiteToCrawlProvider = siteToCrawlProvider
            }
                );

            _crawlerEngine = new ParallelCrawlerEngine(config, impls);

            //Register for site level events
            _crawlerEngine.AllCrawlsCompleted += (sender, eventArgs) =>
            {
                Console.WriteLine("Completed crawling all sites");
                _crawlerEngine.Stop(true);
                Run();
            };
            _crawlerEngine.SiteCrawlCompleted += (sender, eventArgs) =>
            {
                Console.WriteLine("Completed crawling site {0}", eventArgs.CrawledSite.SiteToCrawl.Uri);
            };
            _crawlerEngine.CrawlerInstanceCreated += (sender, eventArgs) =>
            {
                eventArgs.Crawler.CrawlBag = eventArgs.SiteToCrawl.SiteBag;
                //Register for crawler level events. These are Abot's events!!!
                eventArgs.Crawler.PageCrawlCompleted += (abotSender, abotEventArgs) =>
                {
                    var         crawlX      = abotSender as CrawlerX;
                    CrawledPage crawledPage = abotEventArgs.CrawledPage;

                    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                    {
                        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
                    }
                    else
                    {
                        if (string.IsNullOrEmpty(crawledPage.Content.Text))
                        {
                            Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                        }
                        else
                        {
                            try
                            {
                                if (crawledPage.CrawlDepth == 1)
                                {
                                    Console.WriteLine("Depth: {0} --- Crawl of page succeeded {1}", crawledPage.CrawlDepth, crawledPage.Uri.AbsoluteUri);
                                    var item = new CrawledItem()
                                    {
                                        Name       = crawlX.CrawlBag.Name,
                                        PageNumber = crawlX.CrawlBag.Number,
                                        Url        = crawledPage.Uri.AbsoluteUri,
                                        Detail     = crawledPage.Content.Text
                                    };

                                    SqlHelper.Store(new System.Collections.Generic.List <CrawledItem>()
                                    {
                                        item
                                    });
                                }
                            }
                            catch (Exception e)
                            {
                                Console.WriteLine(e.Message);
                            }
                        }
                    }

                    //var htmlAgilityPackDocument = crawledPage.HtmlDocument; //Html Agility Pack parser
                    //var angleSharpHtmlDocument = crawledPage.AngleSharpHtmlDocument; //AngleSharp parser
                };
                eventArgs.Crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
                {
                    CrawlDecision decision = new CrawlDecision {
                        Allow = true
                    };

                    var isCrawlDepth1 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
                    var isCrawlDepth2 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

                    if (isCrawlDepth1 || isCrawlDepth2)
                    {
                        return new CrawlDecision {
                            Allow = false, Reason = "Only crawl Baidu search result and redirect pages"
                        };
                    }

                    return decision;
                });
            };

            _crawlerEngine.StartAsync();
        }
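
        // A minimal sketch of the CrawledItem shape implied by the object initializer in
        // Run() above; the project's actual class (and SqlHelper.Store) are not shown in
        // this excerpt, so treat these members as assumptions.
        public class CrawledItem
        {
            public string Name       { get; set; }  // crawler.CrawlBag.Name
            public int    PageNumber { get; set; }  // crawler.CrawlBag.Number
            public string Url        { get; set; }  // crawledPage.Uri.AbsoluteUri
            public string Detail     { get; set; }  // crawledPage.Content.Text
        }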
        // PRIVATE CLASS MEMBERS

        // MAIN ===================================================================================
        /// <param name="args"> Command line arguments passed to the executable. </param>
        static async Task Main(string[] args)
        {
            // Creates a Serilog logger that writes messages up to Debug level.
            SetupLogger();

            Log.Logger.Information("Darkweb Data Scraper start...");

            // Parses command line arguments and stores them in "parsedArgs"
            SetupParser(args);

            // Moved out of Main to keep this setup separate.
            // Returns a TorSharpSettings object for use with TorSharp.
            var settings = SetupTorSharpSettings();

            // Asynchronously sets up the TorSharp "tools" (the Tor and Privoxy
            // binaries that TorSharp manages) before the proxy is started.
            await SetupTorSharpTools(settings);

            // * starts tor proxy -----------------------------------------------------------------
            using (var proxy = new TorSharpProxy(settings))
            {
                // wait until the TorSharp proxy is configured and started
                await proxy.ConfigureAndStartAsync();

                // * SETUP AND EXECUTE CRAWLER ================================================
                // Setup Crawler configuration
                CrawlConfigurationX crawlConfig = new CrawlConfigurationX
                {
                    MaxPagesToCrawl = 30,                               // Max total urls this crawler should crawl
                    MaxCrawlDepth   = 1,                                // Depth for crawler to traverse urls
                    IsJavascriptRenderingEnabled = false,               // Should crawler render JS?
                    JavascriptRenderingWaitTimeInMilliseconds = 2000,   // How long to wait for js to process
                    MaxConcurrentSiteCrawls = 1,                        // Only crawl a single site at a time
                    MaxRetryCount           = 3                         // Retries to connect and crawl site 'x' times
                };

                if (parsedArgs.InputFile == null) // THIS IS "-s" or "--single"
                {
                    var handler = new HttpClientHandler
                    {
                        Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
                    };

                    // Crawl
                    await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, parsedArgs.StartingUri);

                    BuildBsonDocument(DataScraper.allParsedText, parsedArgs.StartingUri);

                    //reset vals for next crawl
                    DataScraper.allParsedText = new List <string>();
                    DataScraper.siteTitle     = "";
                    DataScraper.dataDocuments = new List <BsonDocument>();

                    if (MONGO_URI == "")
                    {
                        Log.Logger.Information("Database information is no longer accessible or available." +
                                               "You will need to provide your own Mongo details in \"Crawler.cs\".");
                    }
                }
                else // THIS IS "-m" or "--multi"
                {
                    string inputFilePath = parsedArgs.InputFile;

                    var sitesToCrawl = GenerateSiteList(inputFilePath);

                    for (int i = 0; i < sitesToCrawl.Count; i++)
                    {
                        var handler = new HttpClientHandler
                        {
                            Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
                        };

                        // Crawl
                        await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, sitesToCrawl[i]);

                        BuildBsonDocument(DataScraper.allParsedText, sitesToCrawl[i]);

                        if (MONGO_URI == "")
                        {
                            Log.Logger.Information("Database information is no longer accessible or available." +
                                                   "You will need to provide your own Mongo details in \"Crawler.cs\".");
                        }

                        if (MONGO_URI != "" && MONGO_COLLECTION_NAME != "" && MONGO_DATABASE_NAME != "")
                        {
                            var client     = new MongoClient(MONGO_URI);
                            var database   = client.GetDatabase(MONGO_DATABASE_NAME);
                            var collection = database.GetCollection <BsonDocument>(MONGO_COLLECTION_NAME);

                            collection.InsertMany(DataScraper.dataDocuments);
                        }

                        //reset vals for next crawl
                        DataScraper.allParsedText = new List <string>();
                        DataScraper.siteTitle     = "";
                        DataScraper.dataDocuments = new List <BsonDocument>();
                    }
                }
                // * ==========================================================================

                // Stop the TorSharp tools so that the proxy is no longer listening on the configured port.
                proxy.Stop();
            }
            // * ----------------------------------------------------------------------------------
        }
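
        // SetupLogger() and GenerateSiteList() are called in Main but not shown in this
        // excerpt. Minimal sketches follow, assuming Serilog's console sink and a plain
        // text input file with one URI per line; these are assumptions, not the
        // project's confirmed implementations.
        private static void SetupLogger()
        {
            // Write messages up to Debug level (requires the Serilog and
            // Serilog.Sinks.Console packages).
            Log.Logger = new LoggerConfiguration()
                         .MinimumLevel.Debug()
                         .WriteTo.Console()
                         .CreateLogger();
        }

        private static List<string> GenerateSiteList(string inputFilePath)
        {
            // One URI per line; blank lines are skipped (requires System.IO and System.Linq).
            return File.ReadLines(inputFilePath)
                   .Where(line => !string.IsNullOrWhiteSpace(line))
                   .ToList();
        }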