// ========================================================================================
/// <summary> Builds a BSON document of keyword frequencies and their contexts for one
/// crawl, and appends it to DataScraper.dataDocuments. </summary>
/// <param name="parsedText"> All text parsed from the crawled site. </param>
/// <param name="crawledURL"> The URL that was crawled. </param>
static void BuildBsonDocument(List<string> parsedText, string crawledURL)
{
    // Dictionary mapping each desired keyword to its frequency count and a list of all
    // contexts in which it was used.
    Dictionary<string, Pair<int, List<string>>> contextCache =
        DataScraper.GetWordCountAndContext(parsedText, Constants.DefaultIgnoreWordsTXT);

    // Convert the dictionary to a list so it can be sorted by frequency count, most
    // frequent keyword first. Sort takes a Comparison<T> delegate, which must return
    // less than 0 if x precedes y, 0 if they are equal, and greater than 0 otherwise;
    // CompareTo satisfies that contract.
    var dictList = contextCache.ToList();
    dictList.Sort((pair1, pair2) => pair2.Value.Item1.CompareTo(pair1.Value.Item1));

    var sentimentAnalysis = new BsonDocument();
    var numWords = dictList.Count > 100 ? 100 : dictList.Count;

    for (int i = 0; i < numWords; i++)
    {
        //Log.Logger.Debug("Getting Context words for " + dictList[i].Key);

        // Skip the empty-string keyword that parsing still produces; not yet fixed upstream.
        if (dictList[i].Key == "")
        {
            continue;
        }

        // Exclude words we don't care about from the context sentences.
        var desiredWords = DataScraper.ExcludeWords(dictList[i].Value.Item2);

        // Number of occurrences of each context word for the given keyword.
        var contextWordCount = DataScraper.GetWordCount(desiredWords);

        sentimentAnalysis.Add(new BsonElement(
            dictList[i].Key,
            new BsonDocument
            {
                { "Count", dictList[i].Value.Item1 },
                { "ContextSentences", new BsonArray(dictList[i].Value.Item2) },
                { "ContextWordFrequency", new BsonDocument(contextWordCount) }
            }
        ));
    }

    // Assemble the BSON document for this crawl. A collection initializer never yields
    // null, so the result can be added directly.
    var bson = new BsonDocument
    {
        { "WebsiteTitle", DataScraper.siteTitle },
        { "URL", crawledURL },
        // { "Raw", rawPageText },
        { "SentimentAnalysis", sentimentAnalysis }
    };

    DataScraper.dataDocuments.Add(bson);
}
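// ========================================================================================
// For reference: DataScraper.GetWordCount is implemented elsewhere in this project. The
// hypothetical sketch below (note the *Sketch suffix) only illustrates the shape that
// BuildBsonDocument relies on -- a word -> occurrence-count map compatible with the
// BsonDocument(IDictionary<string, object>) constructor used above. It assumes simple
// whitespace tokenization; the real helper may tokenize differently.
static Dictionary<string, object> GetWordCountSketch(List<string> sentences)
{
    var counts = new Dictionary<string, object>();
    foreach (var sentence in sentences)
    {
        foreach (var word in sentence.Split(new[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries))
        {
            // Increment the running count for this word, starting at 1 on first sight.
            counts[word] = counts.TryGetValue(word, out var n) ? (int)n + 1 : 1;
        }
    }
    return counts;
}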
// PRIVATE CLASS MEMBERS
// MAIN ===================================================================================
/// <param name="args"> Command line arguments passed to the executable. </param>
static async Task Main(string[] args)
{
    // Create a Logger object from Serilog that writes prints up to Debug level.
    SetupLogger();

    Log.Logger.Information("Darkweb Data Scraper start...");

    // Parse command line arguments and store them in "parsedArgs".
    SetupParser(args);

    // Build the TorSharpSettings object used to configure TorSharp (moved out of Main).
    var settings = SetupTorSharpSettings();

    // Asynchronously fetch and extract the TorSharp tools (the Tor and Privoxy binaries)
    // so the proxy can run.
    await SetupTorSharpTools(settings);

    // * starts tor proxy -----------------------------------------------------------------
    using (var proxy = new TorSharpProxy(settings))
    {
        // Block until the TorSharp proxy is configured and started.
        await proxy.ConfigureAndStartAsync();

        // * SETUP AND EXECUTE CRAWLER ================================================
        // Setup Crawler configuration
        CrawlConfigurationX crawlConfig = new CrawlConfigurationX
        {
            MaxPagesToCrawl = 30,                             // Max total urls this crawler should crawl
            MaxCrawlDepth = 1,                                // Depth for crawler to traverse urls
            IsJavascriptRenderingEnabled = false,             // Should crawler render JS?
            JavascriptRenderingWaitTimeInMilliseconds = 2000, // How long to wait for js to process
            MaxConcurrentSiteCrawls = 1,                      // Only crawl a single site at a time
            MaxRetryCount = 3                                 // Retry connecting to and crawling a site 'x' times
        };

        if (parsedArgs.InputFile == null) // "-s" or "--single": crawl one site
        {
            var handler = new HttpClientHandler
            {
                Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
            };

            // Crawl
            await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, parsedArgs.StartingUri);
            BuildBsonDocument(DataScraper.allParsedText, parsedArgs.StartingUri);

            // Reset values for the next crawl.
            DataScraper.allParsedText = new List<string>();
            DataScraper.siteTitle = "";
            DataScraper.dataDocuments = new List<BsonDocument>();

            if (MONGO_URI == "")
            {
                Log.Logger.Information("Database information is no longer accessible or available. "
                    + "You will need to provide your own Mongo details in \"Crawler.cs\".");
            }
        }
        else // "-m" or "--multi": crawl every site listed in the input file
        {
            string inputFilePath = parsedArgs.InputFile;
            var sitesToCrawl = GenerateSiteList(inputFilePath);

            for (int i = 0; i < sitesToCrawl.Count; i++)
            {
                var handler = new HttpClientHandler
                {
                    Proxy = new WebProxy(new Uri("http://localhost:" + settings.PrivoxySettings.Port))
                };

                // Crawl
                await DataScraper.Crawl(crawlConfig, handler, parsedArgs.handlerType, sitesToCrawl[i]);
                BuildBsonDocument(DataScraper.allParsedText, sitesToCrawl[i]);
+ "You will need to provide your own Mongo details in \"Crawler.cs\"."); } if (MONGO_URI != "" && MONGO_COLLECTION_NAME != "" && MONGO_DATABASE_NAME != "") { var client = new MongoClient(MONGO_URI); var database = client.GetDatabase(MONGO_DATABASE_NAME); var collection = database.GetCollection <BsonDocument>(MONGO_COLLECTION_NAME); collection.InsertMany(DataScraper.dataDocuments); } //reset vals for next crawl DataScraper.allParsedText = new List <string>(); DataScraper.siteTitle = ""; DataScraper.dataDocuments = new List <BsonDocument>(); } } // * ========================================================================== // Stop the TorSharp tools so that the proxy is no longer listening on the configured port. proxy.Stop(); } // * ---------------------------------------------------------------------------------- }