Beispiel #1
0
        /// <summary>
        /// Checks if there are webpages waiting to be crawled in the queue.
        /// If there are no webpages, it creates the default one.
        /// </summary>
        /// <returns>A completed task; the work itself is synchronous.</returns>
        private Task CheckIfQueueIsEmpty()
        {
            // Bail out before touching the database: previously the DataHelper was
            // constructed (and disposed) even when cancellation was already requested.
            if (isCrawling.IsCancellationRequested)
            {
                LogMessage("Canceled queue checking because the task is canceled.", DebugLevel.Error);
                return Task.CompletedTask;
            }

            using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

            if (!dataHelper.Queue.Any())
            {
                LogMessage("No webpages found to crawl. Adding the default one.");

                // Seed the queue with the fallback URL and its (prioritized) domain record.
                var dn = dataHelper.DomainNames.Add(new DomainName(new Uri(URL_TO_CRAWL_IF_THERE_IS_NO_ROWS).DnsSafeHost)
                {
                    Priority = 1
                });
                var ur = dataHelper.Queue.Add(new UrlRecord(URL_TO_CRAWL_IF_THERE_IS_NO_ROWS, dn.Entity));
                dn.Entity.AddUrlRecord(ur.Entity);

                dataHelper.SaveChanges();

                // Signals the rest of the program that the database was just seeded.
                config.IsFirstTime = true;
                LogMessage("`IsFirstTime` has changed to `true`.");
            }

            return Task.CompletedTask;
        }
Beispiel #2
0
            // Parses the URLs of this webpage.
            // Extracts every <a href> link from `doc`, resolves it against `url`,
            // filters empties/duplicates/already-known URLs and script/style files,
            // then enqueues the remainder as UrlRecords. Errors are logged, never thrown.
            Task ParseUrls(HtmlNode doc, Uri url)
            {
                using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

                try
                {
                    // NOTE: this query is lazy — the side effects inside the final
                    // Select (DomainName inserts + SaveChanges) only execute when
                    // `urls` is enumerated by AddRange below.
                    var urls = doc.Descendants("a")
                               .Select(x => x.GetAttributeValue("href", "").ToLower())                                        // Select all links of `a` tag. NOTE(review): ToLower also lowercases path/query, which are case-sensitive on some servers — confirm intended.
                               .Where(x => !string.IsNullOrWhiteSpace(x))                                                     // remove empty links
                               .Select(x => new Uri(url, x))                                                                  // Convert it to a Uri type (resolves relative links against the page URL)
                               .Select(x => new Uri(x.GetComponents(UriComponents.HttpRequestUrl, UriFormat.Unescaped)))      // format the links (drops fragment, normalizes)
                               .Distinct()                                                                                    // Make it unique
                               .Where(x => !(x.LocalPath.EndsWith(".js") || x.LocalPath.EndsWith(".css")))                    // Don't include JS and CSS files
                               .Where(x => !(dataHelper.Queue.Any(y => y.Url == x) || dataHelper.Index.Any(y => y.Url == x))) // Check if already exists in the queue or the index
                               .Select(x =>
                    {
                        DomainName dn;
                        // making sure there is no duplicates
                        // (SaveChanges immediately after Add so the next iteration's
                        // Any() sees the new domain and doesn't insert it twice)
                        if (!dataHelper.DomainNames.Any(y => y.Domain == x.DnsSafeHost))
                        {
                            dn = dataHelper.DomainNames.Add(new DomainName(x.DnsSafeHost)).Entity;
                            dataHelper.SaveChanges();
                        }
                        else
                        {
                            dn = dataHelper.DomainNames.First(y => y.Domain == x.DnsSafeHost);
                        }
                        var ur = new UrlRecord(x, dn);
                        dn.AddUrlRecord(ur);
                        return(ur);
                    });     // Convert it to a UrlRecord type

                    // Enumerates `urls` (running the pipeline above) and persists the batch.
                    dataHelper.Queue.AddRange(urls);
                    dataHelper.SaveChanges();
                }
                catch (Exception e)
                {
                    // Best-effort: a malformed href or DB error aborts parsing for this
                    // page only; the crawl continues.
                    LogMessage($"Parsing URLs in `{url.AbsoluteUri}` didn't success. Error: {e.Message}", DebugLevel.Warning);
                }
                return(Task.CompletedTask);
            }
Beispiel #3
0
            // Extracts keyword occurrence counts from the page's metadata (title and
            // description), its URL (domain and path) and selected body elements, and
            // links each word set to `webpage` via LinkWordsToWebpage.
            // NOTE(review): `docMetas`/`docClasses` are captured from the enclosing
            // scope — presumably section-name -> weight/class lookups; confirm.
            Task ParseKeywords(HtmlNode doc, Webpage webpage)
            {
                using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

                #region Meta

                // Tokenize into words (letter runs incl. apostrophes, or digit runs)
                // and count occurrences per distinct word.
                var titleKeywords = Regex.Matches(webpage?.Metadata?.Title ?? "", @"([\p{L}']+|\d+)")
                                    .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x)).GroupBy(x => x).Select(x => new { Keyword = x.Key, Count = x.Count() })
                                    .ToDictionary(x => x.Keyword, x => x.Count);
                var descKeywords = Regex.Matches(webpage?.Metadata?.Description ?? "", @"([\p{L}']+|\d+)")
                                   .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x)).GroupBy(x => x).Select(x => new { Keyword = x.Key, Count = x.Count() })
                                   .ToDictionary(x => x.Keyword, x => x.Count);

                // Strip the TLD (everything from the first dot) / file extension
                // before tokenizing the authority and path.
                var domainKeywords = Regex.Matches(Regex.Replace(webpage.Url.Authority, @"\..+$", ""), @"(\p{L}+|\d+)")
                                     .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x)).GroupBy(x => x).Select(x => new { Keyword = x.Key, Count = x.Count() })
                                     .ToDictionary(x => x.Keyword, x => x.Count);
                var urlKeywords = Regex.Matches(Regex.Replace(webpage.Url.LocalPath, @"\..+$", ""), @"(\p{L}+|\d+)")
                                  .Select(x => x.Value).Where(x => !string.IsNullOrWhiteSpace(x)).GroupBy(x => x).Select(x => new { Keyword = x.Key, Count = x.Count() })
                                  .ToDictionary(x => x.Keyword, x => x.Count);

                // NOTE(review): all four tasks share one `dataHelper`; EF contexts are
                // not thread-safe — confirm LinkWordsToWebpage serializes its DB access.
                var linksTasks = new Task[]
                {
                    LinkWordsToWebpage(titleKeywords, webpage, docMetas["title"], dataHelper),
                    LinkWordsToWebpage(descKeywords, webpage, docMetas["description"], dataHelper),
                    LinkWordsToWebpage(domainKeywords, webpage, docMetas["domain"], dataHelper),
                    LinkWordsToWebpage(urlKeywords, webpage, docMetas["url"], dataHelper)
                };

                Task.WaitAll(linksTasks);

                #endregion

                #region Body Classes

                var body = doc.SelectSingleNode("//body");

                // Remove style and script nodes.
                body.SelectNodes("//script")?.ToList().ForEach(x => x.Remove());
                body.SelectNodes("//style")?.ToList().ForEach(x => x.Remove());

                // Stopwatch is passed into LinkWordsToWebpage — presumably for the
                // keywords-parsing timeout; confirm against its implementation.
                var sw = Stopwatch.StartNew();

                // For each configured element/class, tokenize the inner text of all
                // matching nodes and link the resulting word counts to the webpage.
                Parallel.ForEach(docClasses.Keys, keyword =>
                {
                    var keywords = body.SelectNodes($"//{keyword}")?
                                   .Select(x => x.InnerText)
                                   //.Where(x => !string.IsNullOrWhiteSpace(x))
                                   .SelectMany(x => Regex.Matches(x, @"([\p{L}']+|\d+)")).Select(x => x.Value)
                                   .Where(x => !string.IsNullOrWhiteSpace(x)).GroupBy(x => x)
                                   .Select(x => new { Keyword = x.Key, Count = x.Count() }).ToDictionary(x => x.Keyword, x => x.Count);
                    if (keywords != null)
                    {
                        LinkWordsToWebpage(keywords, webpage, docClasses[keyword], dataHelper, sw).Wait();
                    }
                });

                // NOTE(review): `linksTasks` were already awaited above, so this second
                // WaitAll is a no-op — possibly meant to wait on the body-class work,
                // which Parallel.ForEach has already completed synchronously.
                Task.WaitAll(linksTasks);
                sw.Stop();

                #endregion

                return(Task.CompletedTask);
            }
Beispiel #4
0
        /// <summary>
        /// Crawling the web.
        /// </summary>
        /// <remarks>
        /// Blocking loop: pops URLs off the queue, downloads each page, parses its
        /// links and metadata, and schedules background keyword parsing. Returns a
        /// completed task only once cancellation is requested via `isCrawling`.
        /// </remarks>
        private Task CrawlAsync()
        {
            #region Main Crawl Method

            var random = new Random();

            if (isCrawling.IsCancellationRequested)
            {
                LogMessage("Canceled crawling because the task is canceled.", DebugLevel.Error);
                return(Task.CompletedTask);
            }

            // One HttpClient reused for every request in this crawl loop.
            using var http = new HttpClient(new HttpClientHandler()
            {
                AllowAutoRedirect           = true,
                AutomaticDecompression      = DecompressionMethods.GZip | DecompressionMethods.Deflate,
                MaxAutomaticRedirections    = 10,
                MaxRequestContentBufferSize = 100000,
            })
                  {
                      Timeout = TimeSpan.FromSeconds(config.TimeoutInSeconds)
                  };

            #region Headers

            http.DefaultRequestHeaders.Add("User-Agent", $"Mozilla/5.0 (compatible; {config.UserAgent})");
            http.DefaultRequestHeaders.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
            http.DefaultRequestHeaders.Add("Accept-Encoding", "gzip, deflate");
            http.DefaultRequestHeaders.Add("Accept-Language", "en-US, en-UK");
            http.DefaultRequestHeaders.Add("Accept-Charset", "utf-16, utf-8");
            http.DefaultRequestHeaders.Add("Connection", "keep-alive");
            http.DefaultRequestHeaders.Add("Cache-Control", "no-cache");

            #endregion Headers

            // In-flight background keyword-parsing tasks, bounded by MaxWaitForWebpages.
            var tasksOfParsingKeywords = new List <Task>(config.MaxWaitForWebpages);

            while (true)
            {
                using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));

                if (isCrawling.IsCancellationRequested)
                {
                    // Drain the outstanding keyword tasks before shutting down.
                    if (tasksOfParsingKeywords.Count > 0)
                    {
                        Task.WaitAll(tasksOfParsingKeywords.ToArray());
                        dataHelper.SaveChanges();
                    }
                    LogMessage("Finished crawling.");
                    return(Task.CompletedTask);
                }

                // Throttle: block until at least one background task finishes.
                if (tasksOfParsingKeywords.Count >= config.MaxWaitForWebpages)
                {
                    Task.WaitAny(tasksOfParsingKeywords.ToArray());
                }

                var urlRecord = PopUrlRecord();

                HttpResponseMessage res;
                try
                {
                    // NOTE(review): blocking on .Result keeps this loop synchronous by
                    // design; headers-only read defers the body download to below.
                    res = http.GetAsync(urlRecord.Url, HttpCompletionOption.ResponseHeadersRead).Result;
                }
                catch (Exception e)
                {
                    LogMessage($"Getting `{urlRecord.Url.AbsoluteUri}` didn't success. Error: {e.Message}", DebugLevel.Warning);
                    continue;
                }

                if (res.IsSuccessStatusCode)
                {
                    // Use the final URI (after redirects), which may differ from urlRecord.Url.
                    var url = res.RequestMessage.RequestUri;

                    if (dataHelper.Index.Any(x => x.Url == url))
                    {
                        continue;
                    }

                    try
                    {
                        var raw = res.Content.ReadAsStringAsync().Result;
                        var doc = new HtmlDocument();
                        doc.LoadHtml(raw);

                        var metadata = ParseMetadata(doc.DocumentNode);
                        var webpage  = new Webpage(url, metadata, urlRecord.Domain);

                        ParseUrls(doc.DocumentNode, url);

                        dataHelper.SaveChanges();

                        // Parse keywords in the background; the continuation removes the
                        // task from the in-flight list once it completes.
                        var pkTask = Task.Run(() => ParseKeywords(doc.DocumentNode, webpage).Wait());
                        pkTask.ContinueWith((x => tasksOfParsingKeywords.Remove(pkTask)));
                        tasksOfParsingKeywords.Add(pkTask);
                    }
                    catch (Exception e)
                    {
                        LogMessage($"Crawling `{url.AbsoluteUri}` didn't finish successfully. Error: {e.Message}", DebugLevel.Warning);
                    }
                }
                else
                {
                    LogMessage($"Getting `{res.RequestMessage.RequestUri}` didn't success. Status Code: {res.StatusCode}", DebugLevel.Warning);
                }

                // On the seeding run only a single page is crawled, then the crawler stops.
                if (config.IsFirstTime)
                {
                    isCrawling.Cancel();
                    LogMessage("Please restart the program AFTER the crawler will finished.");
                }
            }

            #endregion

            #region Local Methods

            // Gets a URL from the queue and removes it.
            // NOTE(review): the do/while below spins forever when the queue is empty —
            // confirm callers guarantee a non-empty queue (see CheckIfQueueIsEmpty).
            UrlRecord PopUrlRecord()
            {
                using var dataHelper = new DataHelper(DataHelperConfig.Create(config.ConnectionString));
                UrlRecord url = null;

                do
                {
                    // Picking a domain name, with more chance for prioritized domain names (50% - 50%).
                    var dns = random.Next(2) == 0 ?
                              dataHelper.DomainNames.Where(x => x.UrlRecords.Any() && x.Priority == 0).OrderBy(x => Guid.NewGuid()).FirstOrDefault() :
                              dataHelper.DomainNames.Where(x => x.UrlRecords.Any() && x.Priority != 0).OrderBy(x => Guid.NewGuid()).FirstOrDefault();

                    if (dns != null)
                    {
                        // Pick random URL from the domain name URLs list.
                        url = dataHelper.Queue.Where(x => x.Domain.Id == dns.Id).FirstOrDefault();
                    }
                } while(url == null);

                // Remove the record from the queue.
                dataHelper.Queue.Remove(url);
                try
                {
                    dataHelper.SaveChanges();
                }
                // NOTE(review): a failed save is silently swallowed — the record stays
                // in the DB queue and may be popped again later; confirm intended.
                catch { }

                return(url);
            }
Beispiel #5
0
        /// <summary>
        /// Entry point: loads (or scaffolds) the config file, verifies database and
        /// Internet connectivity, starts the configured number of web crawlers, then
        /// serves console commands (`stop all`, `stop N`, `exit`) until exit.
        /// </summary>
        private static async Task Main()
        {
            #region Config

            pathToProgramConfig = AppDomain.CurrentDomain.BaseDirectory + pathToProgramConfig;

            if (!File.Exists(pathToProgramConfig))
            {
                // First run without a config: write a template and ask the user to fill it.
                var txt = JsonConvert.SerializeObject(new ProgramConfig(), Formatting.Indented);
                try
                {
                    File.WriteAllText(pathToProgramConfig, txt);
                }
                catch (Exception e)
                {
                    LogMaster($"Couldn't create or write to `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
                    return;
                }
                LogMaster($"Created config at `{pathToProgramConfig}`. Please fill it and restart the program.", DebugLevel.Warning);
                return;
            }
            else
            {
                try
                {
                    var txt = File.ReadAllText(pathToProgramConfig);
                    config = JsonConvert.DeserializeObject <ProgramConfig>(txt);
                }
                catch (Exception e)
                {
                    LogMaster($"Couldn't read or deserialize the config at `{pathToProgramConfig}`. Error: {e.Message}", DebugLevel.Error);
                    return;
                }
            }

            LogMaster("Config serialized successfully.", DebugLevel.Info);

            #endregion

            #region Check Connection

            var(isSucceeded, isFirstTime) = CheckConnection();

            if (!isSucceeded)
            {
                return;
            }
            if (isFirstTime)
            {
                // On the seeding run use a single crawler and effectively disable the
                // keywords-parsing timeout while the database is being populated.
                config.NumberOfCrawlers = 1;
                config.Crawler_TimeoutForKeywordsParsingInMinutes = 1000000; // Don't timeout the process.
                // Setting case-sensitive to the keywords.
                using var dh = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
                dh.Database.ExecuteSqlCommand(
                    $"ALTER TABLE [{nameof(DataHelper.Keywords)}] ALTER COLUMN [{nameof(Keyword.RootKeywordForm)}] " +
                    "nvarchar(64) COLLATE SQL_Latin1_General_CP1_CS_AS;");
            }

            #endregion

            webCrawlers = new List <WebCrawler>();

            for (int i = 0; i < config.NumberOfCrawlers; i++)
            {
                // Create a web crawler and assign config
                var wc = new WebCrawler(WebCrawlerConfig.Create(
                                            userAgent: config.Crawler_UserAgent,
                                            connectionString: config.Crawler_ConnectionString,
                                            maxWaitForWebpages: config.Crawler_MaxWaitForWebpages,
                                            timeoutInSeconds: config.Crawler_TimeoutInSeconds,
                                            timeoutForKeywordsParsingInMinutes: config.Crawler_TimeoutForKeywordsParsingInMinutes,
                                            id: i
                                            ));

                wc.Log += Log;
                // NOTE(review): awaiting here starts the crawlers one after another —
                // confirm `StartAsync` returns once crawling has been kicked off.
                await Task.Run(wc.StartAsync);

                webCrawlers.Add(wc);
            }

            // Don't close the application
            string cmd;
            do
            {
                // Console.ReadLine() returns null when stdin is closed; treat that as
                // `exit` instead of crashing with a NullReferenceException.
                cmd = Console.ReadLine()?.ToLower() ?? "exit";
                if (cmd == "stop all")
                {
                    while (webCrawlers.Count > 0)
                    {
                        var wc = webCrawlers[0];
                        webCrawlers.Remove(wc);
                        wc.StopAsync().Wait();
                    }
                }
                else
                {
                    var match = Regex.Match(cmd, @"stop (\d+)");
                    if (match.Success)
                    {
                        var n = int.Parse(match.Groups[1].Value);
                        // Bug fix: the bound was `i <= webCrawlers.Count`, which indexed
                        // one past the end of the list whenever n exceeded the count.
                        for (int i = 0; i < n && i < webCrawlers.Count; i++)
                        {
                            webCrawlers[i].StopAsync().Wait();
                        }
                    }
                }
            } while(cmd != "exit");
        }
Beispiel #6
0
        /// <summary>
        /// Ensure that the connections to the database and the Internet exist.
        /// </summary>
        /// <returns>Returns the status if it succeeded and if it is the first time.</returns>
        private static (bool isSucceeded, bool isFirstTime) CheckConnection()
        {
            using var dataHelper = new DataHelper(DataHelperConfig.Create(config.Crawler_ConnectionString));
            var firstRun = false;

            LogMaster("Checking connections...");

            // --- Database connectivity ---
            LogMaster("Starting checking database connection...");

            try
            {
                var timer = Stopwatch.StartNew();

                // EnsureCreated() returns true when the schema did not exist before,
                // which means this is the very first run against this database.
                if (dataHelper.Database.EnsureCreated())
                {
                    timer.Stop();
                    LogMaster($"The database was created in {timer.Elapsed.TotalSeconds:F3}s.");
                    firstRun = true;
                }
                else
                {
                    LogMaster("Found the database.");
                }

                LogMaster("Database connection checking completed successfully.");
            }
            catch (Exception e)
            {
                LogMaster($"Failed to connect to the database. Error: {e.Message}", DebugLevel.Error);
                return (false, firstRun);
            }

            // --- Internet connectivity: ping both Google public DNS servers ---
            LogMaster("Starting to check Internet connection...");
            using var ping = new Ping();
            var failedPings = 0;

            // Pings one host (20s timeout), logs the outcome, returns true on success.
            bool TryPing(string host)
            {
                LogMaster($"Pinging {host}...");
                var reply = ping.Send(host, 20000);
                if (reply.Status == IPStatus.Success)
                {
                    LogMaster($"Connection to {host} completed successfully in {reply.RoundtripTime}ms.");
                    return true;
                }
                LogMaster($"Connection to {host} completed with an error ({reply.Status}).", DebugLevel.Warning);
                return false;
            }

            try
            {
                if (!TryPing("8.8.8.8"))
                {
                    failedPings++;
                }
                if (!TryPing("8.8.4.4"))
                {
                    failedPings++;
                }
            }
            catch (PingException e)
            {
                LogMaster($"Failed to connect to the Internet. Error: {e.Message}", DebugLevel.Error);
                return (false, firstRun);
            }

            // One unreachable host is tolerated; both failing means no connectivity.
            if (failedPings > 1)
            {
                LogMaster($"Failed to connect to the Internet because too many connection errors.", DebugLevel.Error);
                return (false, firstRun);
            }

            LogMaster("Internet connection checking completed successfully.");

            LogMaster("Connections checking completed successfully.");
            return (true, firstRun);
        }