/// <summary>
/// Checks if there are webpages waiting to be crawled in the queue.
/// If there are no webpages, it creates the default one.
/// </summary>
private Task CheckIfQueueIsEmpty()
{
    using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

    if (isCrawling.IsCancellationRequested)
    {
        LogMessage("Canceled queue checking because the task is canceled.", DebugLevel.Error);
        return Task.CompletedTask;
    }

    if (!dataHelper.Queue.Any())
    {
        LogMessage("No webpages found to crawl. Adding the default one.");

        var dn = dataHelper.DomainNames.Add(
            new DomainName(new Uri(URL_TO_CRAWL_IF_THERE_IS_NO_ROWS).DnsSafeHost) { Priority = 1 });
        var ur = dataHelper.Queue.Add(new UrlRecord(URL_TO_CRAWL_IF_THERE_IS_NO_ROWS, dn.Entity));
        dn.Entity.AddUrlRecord(ur.Entity);
        dataHelper.SaveChanges();

        config.IsFirstTime = true;
        LogMessage("`IsFirstTime` has changed to `true`.");
    }

    return Task.CompletedTask;
}
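// Illustration only (hypothetical helper, not part of the crawler): shows what
// `DnsSafeHost` yields for a seed URL, since that string is what the default
// `DomainName` row above is created from.
private static void DemoDnsSafeHost()
{
    var seed = new Uri("https://en.wikipedia.org/wiki/Web_crawler"); // example value, not the real constant
    Console.WriteLine(seed.DnsSafeHost); // prints "en.wikipedia.org"
}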
// Parses the URLs of this webpage and queues the new ones.
Task ParseUrls(HtmlNode doc, Uri url)
{
    using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

    try
    {
        var urls = doc.Descendants("a")
            .Select(x => x.GetAttributeValue("href", "").ToLower())   // Select the targets of all `<a>` tags
            .Where(x => !string.IsNullOrWhiteSpace(x))                // Remove empty links
            .Select(x => new Uri(url, x))                             // Resolve relative links against the current page
            .Select(x => new Uri(x.GetComponents(UriComponents.HttpRequestUrl, UriFormat.Unescaped))) // Normalize the links (drops the fragment)
            .Distinct()                                               // Remove duplicates
            .Where(x => !(x.LocalPath.EndsWith(".js") || x.LocalPath.EndsWith(".css"))) // Skip JS and CSS files
            .Where(x => !(dataHelper.Queue.Any(y => y.Url == x) || dataHelper.Index.Any(y => y.Url == x))) // Skip URLs that are already queued or indexed
            .Select(x =>
            {
                // Reuse the domain row if it already exists, so no duplicates are created.
                DomainName dn;
                if (!dataHelper.DomainNames.Any(y => y.Domain == x.DnsSafeHost))
                {
                    dn = dataHelper.DomainNames.Add(new DomainName(x.DnsSafeHost)).Entity;
                    dataHelper.SaveChanges();
                }
                else
                {
                    dn = dataHelper.DomainNames.First(y => y.Domain == x.DnsSafeHost);
                }

                var ur = new UrlRecord(x, dn);
                dn.AddUrlRecord(ur);
                return ur;                                            // Convert to a UrlRecord
            });

        dataHelper.Queue.AddRange(urls);
        dataHelper.SaveChanges();
    }
    catch (Exception e)
    {
        LogMessage($"Parsing URLs in `{url.AbsoluteUri}` failed. Error: {e.Message}", DebugLevel.Warning);
    }

    return Task.CompletedTask;
}
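// Illustration only (hypothetical helper, not part of the crawler): a minimal sketch of
// the normalization step used above. `UriComponents.HttpRequestUrl` keeps the scheme,
// host, port, path, and query, and drops the fragment, so links that differ only by
// anchor collapse to a single queue entry.
private static void DemoUrlNormalization()
{
    var page = new Uri("https://example.com/articles/");
    var resolved = new Uri(page, "../about?x=1#section-2"); // resolve a relative href
    var normalized = new Uri(resolved.GetComponents(
        UriComponents.HttpRequestUrl, UriFormat.Unescaped));
    Console.WriteLine(normalized); // prints "https://example.com/about?x=1" (no fragment)
}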
Task ParseKeywords(HtmlNode doc, Webpage webpage)
{
    using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

    #region Meta
    // Tokenize with `([\p{L}']+|\d+)`: runs of letters (in any language) and apostrophes, or runs of digits.
    var titleKeywords = Regex.Matches(webpage?.Metadata?.Title ?? "", @"([\p{L}']+|\d+)")
        .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x))
        .GroupBy(x => x).ToDictionary(x => x.Key, x => x.Count());

    var descKeywords = Regex.Matches(webpage?.Metadata?.Description ?? "", @"([\p{L}']+|\d+)")
        .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x))
        .GroupBy(x => x).ToDictionary(x => x.Key, x => x.Count());

    // `\..+$` strips everything after the first dot (e.g. "example.com" -> "example").
    var domainKeywords = Regex.Matches(Regex.Replace(webpage.Url.Authority, @"\..+$", ""), @"(\p{L}+|\d+)")
        .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x))
        .GroupBy(x => x).ToDictionary(x => x.Key, x => x.Count());

    var urlKeywords = Regex.Matches(Regex.Replace(webpage.Url.LocalPath, @"\..+$", ""), @"(\p{L}+|\d+)")
        .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x))
        .GroupBy(x => x).ToDictionary(x => x.Key, x => x.Count());

    var linksTasks = new Task[]
    {
        LinkWordsToWebpage(titleKeywords, webpage, docMetas["title"], dataHelper),
        LinkWordsToWebpage(descKeywords, webpage, docMetas["description"], dataHelper),
        LinkWordsToWebpage(domainKeywords, webpage, docMetas["domain"], dataHelper),
        LinkWordsToWebpage(urlKeywords, webpage, docMetas["url"], dataHelper)
    };
    Task.WaitAll(linksTasks);
    #endregion

    #region Body Classes
    var body = doc.SelectSingleNode("//body");

    // Remove style and script nodes.
    body.SelectNodes("//script")?.ToList().ForEach(x => x.Remove());
    body.SelectNodes("//style")?.ToList().ForEach(x => x.Remove());

    var sw = Stopwatch.StartNew();
    Parallel.ForEach(docClasses.Keys, keyword =>
    {
        var keywords = body.SelectNodes($"//{keyword}")?
            .Select(x => x.InnerText)
            .SelectMany(x => Regex.Matches(x, @"([\p{L}']+|\d+)")).Select(x => x.Value)
            .Where(x => !string.IsNullOrWhiteSpace(x))
            .GroupBy(x => x).ToDictionary(x => x.Key, x => x.Count());

        if (keywords != null)
        {
            LinkWordsToWebpage(keywords, webpage, docClasses[keyword], dataHelper, sw).Wait();
        }
    });
    sw.Stop();
    #endregion

    return Task.CompletedTask;
}
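// Illustration only (hypothetical helper, not part of the crawler): the tokenizer used
// above. `([\p{L}']+|\d+)` matches runs of Unicode letters and apostrophes, or runs of
// digits, and the GroupBy/ToDictionary pair turns the matches into per-word counts.
// Note that grouping is case-sensitive, which is why `Main` switches the keywords
// column to a case-sensitive collation on first run.
private static void DemoKeywordCounts()
{
    var counts = Regex.Matches("It's round, it's 2 sided, it's a coin", @"([\p{L}']+|\d+)")
        .Select(m => m.Value)
        .GroupBy(t => t)
        .ToDictionary(g => g.Key, g => g.Count());
    // counts: { "It's" = 1, "round" = 1, "it's" = 2, "2" = 1, "sided" = 1, "a" = 1, "coin" = 1 }
}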
/// <summary>
/// Crawls the web.
/// </summary>
private Task CrawlAsync()
{
    #region Main Crawl Method
    var random = new Random();

    if (isCrawling.IsCancellationRequested)
    {
        LogMessage("Canceled crawling because the task is canceled.", DebugLevel.Error);
        return Task.CompletedTask;
    }

    using var http = new HttpClient(new HttpClientHandler()
    {
        AllowAutoRedirect = true,
        AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate,
        MaxAutomaticRedirections = 10,
        MaxRequestContentBufferSize = 100000,
    })
    {
        Timeout = TimeSpan.FromSeconds(config.TimeoutInSeconds)
    };

    #region Headers
    http.DefaultRequestHeaders.Add("User-Agent", $"Mozilla/5.0 (compatible; {config.UserAgent})");
    http.DefaultRequestHeaders.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    http.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate");
    http.DefaultRequestHeaders.Add("Accept-Language", "en-US, en-GB");
    http.DefaultRequestHeaders.Add("Accept-Charset", "utf-16, utf-8");
    http.DefaultRequestHeaders.Add("Connection", "keep-alive");
    http.DefaultRequestHeaders.Add("Cache-Control", "no-cache");
    #endregion Headers

    var tasksOfParsingKeywords = new List<Task>(config.MaxWaitForWebpages);

    while (true)
    {
        using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

        if (isCrawling.IsCancellationRequested)
        {
            if (tasksOfParsingKeywords.Count > 0)
            {
                Task.WaitAll(tasksOfParsingKeywords.ToArray());
                dataHelper.SaveChanges();
            }

            LogMessage("Finished crawling.");
            return Task.CompletedTask;
        }

        // Throttle: wait until at least one keyword-parsing task finishes.
        if (tasksOfParsingKeywords.Count >= config.MaxWaitForWebpages)
        {
            Task.WaitAny(tasksOfParsingKeywords.ToArray());
        }

        var urlRecord = PopUrlRecord();

        HttpResponseMessage res;
        try
        {
            res = http.GetAsync(urlRecord.Url, HttpCompletionOption.ResponseHeadersRead).Result;
        }
        catch (Exception e)
        {
            LogMessage($"Getting `{urlRecord.Url.AbsoluteUri}` failed. Error: {e.Message}", DebugLevel.Warning);
            continue;
        }

        if (res.IsSuccessStatusCode)
        {
            // Use the final URL (after redirects).
            var url = res.RequestMessage.RequestUri;
            if (dataHelper.Index.Any(x => x.Url == url))
            {
                continue;
            }

            try
            {
                var raw = res.Content.ReadAsStringAsync().Result;
                var doc = new HtmlDocument();
                doc.LoadHtml(raw);

                var metadata = ParseMetadata(doc.DocumentNode);
                var webpage = new Webpage(url, metadata, urlRecord.Domain);

                ParseUrls(doc.DocumentNode, url);
                dataHelper.SaveChanges();

                var pkTask = Task.Run(() => ParseKeywords(doc.DocumentNode, webpage).Wait());
                pkTask.ContinueWith(x => tasksOfParsingKeywords.Remove(pkTask));
                tasksOfParsingKeywords.Add(pkTask);
            }
            catch (Exception e)
            {
                LogMessage($"Crawling `{url.AbsoluteUri}` didn't finish successfully. Error: {e.Message}", DebugLevel.Warning);
            }
        }
        else
        {
            LogMessage($"Getting `{res.RequestMessage.RequestUri}` failed. Status Code: {res.StatusCode}", DebugLevel.Warning);
        }

        if (config.IsFirstTime)
        {
            isCrawling.Cancel();
            LogMessage("Please restart the program AFTER the crawler has finished.");
        }
    }
    #endregion

    #region Local Methods
    // Gets a URL record from the queue and removes it.
    UrlRecord PopUrlRecord()
    {
        using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

        UrlRecord url = null;
        do
        {
            // Flip a coin (50/50) between non-prioritized and prioritized domain names,
            // then pick a random domain from the chosen group.
            var dns = random.Next(2) == 0
                ? dataHelper.DomainNames.Where(x => x.UrlRecords.Any() && x.Priority == 0).OrderBy(x => Guid.NewGuid()).FirstOrDefault()
                : dataHelper.DomainNames.Where(x => x.UrlRecords.Any() && x.Priority != 0).OrderBy(x => Guid.NewGuid()).FirstOrDefault();

            if (dns != null)
            {
                // Take the first queued URL of the chosen domain name.
                url = dataHelper.Queue.Where(x => x.Domain.Id == dns.Id).FirstOrDefault();
            }
        } while (url == null); // Retry until a queued URL is found.

        // Remove the record from the queue.
        dataHelper.Queue.Remove(url);
        try
        {
            dataHelper.SaveChanges();
        }
        catch
        {
            // Ignore save conflicts (e.g. another crawler already removed this record).
        }

        return url;
    }
    #endregion
}
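// Illustration only (hypothetical helper, not part of the crawler): the coin flip in
// `PopUrlRecord` splits picks 50/50 between prioritized and non-prioritized domains,
// no matter how many domains are in each group. (The `OrderBy(x => Guid.NewGuid())`
// above is the usual EF trick for a random row; on SQL Server it typically translates
// to `ORDER BY NEWID()`.) A quick simulation of the split:
private static void DemoPrioritySplit()
{
    var random = new Random();
    int prioritized = 0;
    for (int i = 0; i < 10000; i++)
    {
        if (random.Next(2) != 0) // same condition as the prioritized branch above
        {
            prioritized++;
        }
    }
    Console.WriteLine(prioritized); // ~5000: prioritized domains get about half of all draws
}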
private static async Task Main()
{
    #region Config
    pathToProgramConfig = AppDomain.CurrentDomain.BaseDirectory + pathToProgramConfig;
    if (!File.Exists(pathToProgramConfig))
    {
        var txt = JsonConvert.SerializeObject(new ProgramConfig(), Formatting.Indented);
        try
        {
            File.WriteAllText(pathToProgramConfig, txt);
        }
        catch (Exception e)
        {
            LogMaster($"Couldn't create or write to `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
            return;
        }

        LogMaster($"Created config at `{pathToProgramConfig}`. Please fill it and restart the program.", DebugLevel.Warning);
        return;
    }
    else
    {
        try
        {
            var txt = File.ReadAllText(pathToProgramConfig);
            config = JsonConvert.DeserializeObject<ProgramConfig>(txt);
        }
        catch (Exception e)
        {
            LogMaster($"Couldn't read or deserialize the config at `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
            return;
        }
    }

    LogMaster("Config deserialized successfully.", DebugLevel.Info);
    #endregion

    #region Check Connection
    var (isSucceeded, isFirstTime) = CheckConnection();
    if (!isSucceeded)
    {
        return;
    }

    if (isFirstTime)
    {
        config.NumberOfCrawlers = 1;
        config.Crawler_TimeoutForKeywordsParsingInMinutes = 1000000; // Don't time out the process.

        // Make the keywords column case-sensitive.
        using var dh = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
        dh.Database.ExecuteSqlCommand(
            $"ALTER TABLE [{nameof(DataHelper.Keywords)}] ALTER COLUMN [{nameof(Keyword.RootKeywordForm)}] " +
            "nvarchar(64) COLLATE SQL_Latin1_General_CP1_CS_AS;");
    }
    #endregion

    webCrawlers = new List<WebCrawler>();
    for (int i = 0; i < config.NumberOfCrawlers; i++)
    {
        // Create a web crawler and assign its config.
        var wc = new WebCrawler(WebCrawlerConfig.Create(
            userAgent: config.Crawler_UserAgent,
            connectionString: config.Crawler_ConnectionString,
            maxWaitForWebpages: config.Crawler_MaxWaitForWebpages,
            timeoutInSeconds: config.Crawler_TimeoutInSeconds,
            timeoutForKeywordsParsingInMinutes: config.Crawler_TimeoutForKeywordsParsingInMinutes,
            id: i
        ));
        wc.Log += Log;
        await Task.Run(wc.StartAsync);
        webCrawlers.Add(wc);
    }

    // Keep the application open and handle console commands.
    string cmd;
    do
    {
        cmd = Console.ReadLine().ToLower();
        if (cmd == "stop all")
        {
            while (webCrawlers.Count > 0)
            {
                var wc = webCrawlers[0];
                webCrawlers.Remove(wc);
                wc.StopAsync().Wait();
            }
        }
        else
        {
            var match = Regex.Match(cmd, @"stop (\d+)");
            if (match.Success)
            {
                var n = int.Parse(match.Groups[1].Value);
                for (int i = 0; i < n && i < webCrawlers.Count; i++)
                {
                    webCrawlers[i].StopAsync().Wait();
                }
            }
        }
    } while (cmd != "exit");
}
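// Illustration only: the config file is plain Json.NET output of `ProgramConfig`, so,
// assuming the class exposes exactly the properties referenced in `Main` above, a
// filled-in file should look roughly like this (all values are illustrative):
//
// {
//   "NumberOfCrawlers": 2,
//   "Crawler_UserAgent": "MyCrawlerBot/1.0 (+https://example.com/bot)",
//   "Crawler_ConnectionString": "Server=localhost;Database=Crawler;Trusted_Connection=True;",
//   "Crawler_MaxWaitForWebpages": 10,
//   "Crawler_TimeoutInSeconds": 30,
//   "Crawler_TimeoutForKeywordsParsingInMinutes": 5
// }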
/// <summary>
/// Ensures that the connections to the database and the Internet exist.
/// </summary>
/// <returns>Whether the check succeeded and whether this is the first run.</returns>
private static (bool isSucceeded, bool isFirstTime) CheckConnection()
{
    using var dataHelper = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
    bool isFirstTime = false;
    LogMaster("Checking connections...");

    // Check the connection to the database.
    LogMaster("Starting database connection check...");
    try
    {
        var sw = Stopwatch.StartNew();

        // `EnsureCreated` returns true only if it had to create the database.
        if (dataHelper.Database.EnsureCreated())
        {
            sw.Stop();
            LogMaster($"The database was created in {sw.Elapsed.TotalSeconds:F3}s.");
            isFirstTime = true;
        }
        else
        {
            LogMaster("Found the database.");
        }

        LogMaster("Database connection check completed successfully.");
    }
    catch (Exception e)
    {
        LogMaster($"Failed to connect to the database. Error: {e.Message}", DebugLevel.Error);
        return (false, isFirstTime);
    }

    // Check the connection to the Internet.
    LogMaster("Starting Internet connection check...");
    var ping = new Ping();
    var errors = 0;
    try
    {
        LogMaster("Pinging 8.8.8.8...");
        var pr = ping.Send("8.8.8.8", 20000);
        if (pr.Status == IPStatus.Success)
        {
            LogMaster($"Connection to 8.8.8.8 completed successfully in {pr.RoundtripTime}ms.");
        }
        else
        {
            LogMaster($"Connection to 8.8.8.8 completed with an error ({pr.Status}).", DebugLevel.Warning);
            errors++;
        }

        LogMaster("Pinging 8.8.4.4...");
        pr = ping.Send("8.8.4.4", 20000);
        if (pr.Status == IPStatus.Success)
        {
            LogMaster($"Connection to 8.8.4.4 completed successfully in {pr.RoundtripTime}ms.");
        }
        else
        {
            LogMaster($"Connection to 8.8.4.4 completed with an error ({pr.Status}).", DebugLevel.Warning);
            errors++;
        }
    }
    catch (PingException e)
    {
        LogMaster($"Failed to connect to the Internet. Error: {e.Message}", DebugLevel.Error);
        return (false, isFirstTime);
    }
    finally
    {
        ping.Dispose();
    }

    // Both pings failed.
    if (errors > 1)
    {
        LogMaster("Failed to connect to the Internet because of too many connection errors.", DebugLevel.Error);
        return (false, isFirstTime);
    }

    LogMaster("Internet connection check completed successfully.");
    LogMaster("Connection checks completed successfully.");
    return (true, isFirstTime);
}