// Example #1
// 0
        // ========================================================================================
        // ========================================================================================
        /// <summary>
        /// Builds a BSON document summarizing keyword frequency and context sentences for one
        /// crawled page and appends it to <c>DataScraper.dataDocuments</c> for a later insert.
        /// </summary>
        /// <param name="parsedText"> All text fragments parsed from the crawled page. </param>
        /// <param name="crawledURL"> The URL the text was crawled from. </param>
        static void BuildBsonDocument(List <string> parsedText, string crawledURL)
        {
            // Keyword -> (frequency count, list of context sentences the keyword appeared in).
            Dictionary <string, Pair <int, List <string> > > contextCache = DataScraper.GetWordCountAndContext(parsedText, Constants.DefaultIgnoreWordsTXT);

            // Materialize the dictionary into a list so it can be sorted by frequency.
            var dictList = contextCache.ToList();

            // Sort by frequency count, descending. Using CompareTo keeps the comparison
            // consistent: the original "always -1 or 1" lambda never returned 0 for equal
            // counts, which violates the Comparison<T> contract and can make List<T>.Sort
            // throw InvalidOperationException ("IComparer.Compare() method returns
            // inconsistent results").
            dictList.Sort((pair1, pair2) => pair2.Value.Item1.CompareTo(pair1.Value.Item1));

            var sentimentAnalysis = new BsonDocument();

            // Cap the document at the 100 most frequent keywords.
            var numWords = Math.Min(dictList.Count, 100);

            for (int i = 0; i < numWords; i++)
            {
                // Skip the empty-string keyword produced upstream (known parsing artifact).
                if (string.IsNullOrEmpty(dictList[i].Key))
                {
                    continue;
                }

                // Exclude ignorable words from this keyword's context sentences.
                var desiredWords = DataScraper.ExcludeWords(dictList[i].Value.Item2);

                // Number of occurrences of each context word for the current keyword.
                var contextWordCount = DataScraper.GetWordCount(desiredWords);

                sentimentAnalysis.Add(new BsonElement(
                                          dictList[i].Key, new BsonDocument
                {
                    { "Count", dictList[i].Value.Item1 },
                    { "ContextSentences", new BsonArray(dictList[i].Value.Item2) },
                    { "ContextWordFrequency", new BsonDocument(contextWordCount) }
                }
                                          ));
            }

            // Assemble the per-site document. (The former "if (bson != null)" guard was dead
            // code: an object-creation expression never yields null.)
            var bson = new BsonDocument
            {
                { "WebsiteTitle", DataScraper.siteTitle },
                { "URL", crawledURL },
                { "SentimentAnalysis", sentimentAnalysis }
            };

            DataScraper.dataDocuments.Add(bson);
        }
// Example #2
// 0
        // PRIVATE CLASS MEMBERS

        // MAIN ===================================================================================
        /// <summary>
        /// Entry point. Configures logging, parses arguments, starts a Tor proxy via TorSharp,
        /// then crawls either a single site ("-s"/"--single") or every site listed in an input
        /// file ("-m"/"--multi"), building a BSON analysis document per site. In multi mode the
        /// documents are also inserted into MongoDB when connection details are configured.
        /// </summary>
        /// <param name="args"> Command line arguments passed to executable. </param>
        static async Task Main(string[] args)
        {
            // Creates a Logger object from Serilog. Writes up to Debug level prints.
            SetupLogger();

            Log.Logger.Information("Darkweb Data Scraper start...");

            // Parses command line arguments and stores them in "parsedArgs".
            SetupParser(args);

            // Returns a TorSharpSettings object for use with TorSharp.
            var settings = SetupTorSharpSettings();

            // Fetches/prepares the TorSharp tools (Tor + Privoxy) asynchronously.
            await SetupTorSharpTools(settings);

            // * starts tor proxy -----------------------------------------------------------------
            using (var proxy = new TorSharpProxy(settings))
            {
                // Block until the TorSharp proxy is configured and listening.
                // (The original wrapped this in a single-iteration "while (waiting)" loop
                // that could only ever run once; the loop was removed as dead scaffolding.)
                await proxy.ConfigureAndStartAsync();

                // * SETUP AND EXECUTE CRAWLER ================================================
                // Setup Crawler configuration
                CrawlConfigurationX crawlConfig = new CrawlConfigurationX
                {
                    MaxPagesToCrawl = 30,                               // Max total urls this crawler should crawl
                    MaxCrawlDepth   = 1,                                // Depth for crawler to traverse urls
                    IsJavascriptRenderingEnabled = false,               // Should crawler render JS?
                    JavascriptRenderingWaitTimeInMilliseconds = 2000,   // How long to wait for js to process
                    MaxConcurrentSiteCrawls = 1,                        // Only crawl a single site at a time
                    MaxRetryCount           = 3                         // Retries to connect and crawl site 'x' times
                };

                if (parsedArgs.InputFile == null) // THIS IS "-s" or "--single"
                {
                    // Route HTTP traffic through the local Privoxy -> Tor chain.
                    var handler = new HttpClientHandler
                    {
                        Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
                    };

                    // Crawl
                    await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, parsedArgs.StartingUri);

                    BuildBsonDocument(DataScraper.allParsedText, parsedArgs.StartingUri);

                    // NOTE(review): unlike multi mode below, nothing is inserted into MongoDB
                    // here, so the document just built is discarded by the reset that follows —
                    // confirm this is intentional.

                    //reset vals for next crawl
                    DataScraper.allParsedText = new List <string>();
                    DataScraper.siteTitle     = "";
                    DataScraper.dataDocuments = new List <BsonDocument>();

                    if (MONGO_URI == "")
                    {
                        Log.Logger.Information("Database information is no longer accessible or available." +
                                               "You will need to provide your own Mongo details in \"Crawler.cs\".");
                    }
                }
                else // THIS IS "-m" or "--multi"
                {
                    // (Removed a stray '@' verbatim-identifier prefix here; it had no effect
                    // on 'parsedArgs' and only obscured the code.)
                    string inputFilePath = parsedArgs.InputFile;

                    var sitesToCrawl = GenerateSiteList(inputFilePath);

                    for (int i = 0; i < sitesToCrawl.Count; i++)
                    {
                        // Route HTTP traffic through the local Privoxy -> Tor chain.
                        var handler = new HttpClientHandler
                        {
                            Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
                        };

                        // Crawl
                        await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, sitesToCrawl[i]);

                        BuildBsonDocument(DataScraper.allParsedText, sitesToCrawl[i]);

                        if (MONGO_URI == "")
                        {
                            Log.Logger.Information("Database information is no longer accessible or available." +
                                                   "You will need to provide your own Mongo details in \"Crawler.cs\".");
                        }

                        // Persist this site's documents when full Mongo details are configured.
                        if (MONGO_URI != "" && MONGO_COLLECTION_NAME != "" && MONGO_DATABASE_NAME != "")
                        {
                            var client     = new MongoClient(MONGO_URI);
                            var database   = client.GetDatabase(MONGO_DATABASE_NAME);
                            var collection = database.GetCollection <BsonDocument>(MONGO_COLLECTION_NAME);

                            collection.InsertMany(DataScraper.dataDocuments);
                        }

                        //reset vals for next crawl
                        DataScraper.allParsedText = new List <string>();
                        DataScraper.siteTitle     = "";
                        DataScraper.dataDocuments = new List <BsonDocument>();
                    }
                }
                // * ==========================================================================

                // Stop the TorSharp tools so that the proxy is no longer listening on the configured port.
                proxy.Stop();
            }
            // * ----------------------------------------------------------------------------------
        }