private void btnStartCrawling_Click(object sender, RoutedEventArgs e)
{
    Directory.CreateDirectory(sourceFilesDirectory);

    // Restore previously crawled URLs so they are not downloaded again.
    if (File.Exists(srCrawledLinksPath))
    {
        foreach (var item in File.ReadLines(srCrawledLinksPath))
        {
            hsCrawledLinks.Add(item);
        }
    }

    // Restore discovered-but-not-yet-crawled URLs, skipping any already crawled.
    if (File.Exists(srLinksDiscoveredPath))
    {
        foreach (var item in File.ReadLines(srLinksDiscoveredPath))
        {
            if (hsCrawledLinks.Contains(item))
            {
                continue;
            }
            hsDiscoveredLinks.Add(item);
        }
    }

    HTTPDownloader.readFailedUrls(ref dicFailedUrls);

    // Seed the frontier with the root URL entered by the user.
    hsDiscoveredLinks.Add(txtRootUrl.Text);

    while (hsDiscoveredLinks.Count > 0)
    {
        string srNextUrl = "";
        List<string> lstRemove = new List<string>();

        foreach (var item in hsDiscoveredLinks)
        {
            string srHash = item.func_GenerateURLHash();

            // Skip URLs that failed 3+ times and are still inside their 24-hour pause;
            // once the pause has expired, clear the failure record and allow a retry.
            if (dicFailedUrls.ContainsKey(srHash) && dicFailedUrls[srHash].irFailCount >= 3)
            {
                if (dicFailedUrls[srHash].dtPause.AddHours(24) > DateTime.Now)
                {
                    continue;
                }
                dicFailedUrls.Remove(srHash);
            }

            if (hsCrawledLinks.Contains(item))
            {
                // Already crawled: queue for removal from the frontier.
                lstRemove.Add(item);
            }
            else
            {
                srNextUrl = item;
                break;
            }
        }

        foreach (var item in lstRemove)
        {
            hsDiscoveredLinks.Remove(item);
        }

        // No crawlable URL remains (all crawled, failed, or paused): stop.
        if (srNextUrl.Length == 0)
        {
            break;
        }

        crawlGivenURL(srNextUrl);
    }
}
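// Both methods in this section call a string extension func_GenerateURLHash that turns
// a URL into a filesystem-safe key (it is used both as a dictionary key and as a cache
// file name). Its implementation is not shown in this section; the sketch below is an
// assumption (an MD5 hex digest), not the original author's code, but any stable
// one-way hash that produces filename-safe output would serve the same purpose.
public static class UrlHashExtensions
{
    public static string func_GenerateURLHash(this string srUrl)
    {
        using (var md5 = System.Security.Cryptography.MD5.Create())
        {
            byte[] hash = md5.ComputeHash(System.Text.Encoding.UTF8.GetBytes(srUrl));
            var sb = new System.Text.StringBuilder(hash.Length * 2);
            foreach (byte b in hash)
            {
                sb.Append(b.ToString("x2")); // lowercase hex, safe for file names
            }
            return sb.ToString();
        }
    }
}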
private static void crawlGivenURL(string srCrawlURL)
{
    string srUrlHash = srCrawlURL.func_GenerateURLHash();
    string srDownloadedFileSaveName = sourceFilesDirectory + "/" + srUrlHash + ".txt";
    string srBaseUrl = srCrawlURL;

    HTTPDownloader.WebPageDownloadResult myDownloadResult = new HTTPDownloader.WebPageDownloadResult();

    if (File.Exists(srDownloadedFileSaveName))
    {
        // The page source is already cached on disk; reuse it instead of re-downloading.
        myDownloadResult.srCrawledPageSource = File.ReadAllText(srDownloadedFileSaveName);
    }
    else
    {
        myDownloadResult = HTTPDownloader.FuncCrawlGivenURL(srBaseUrl);

        if (myDownloadResult.occuredException != null)
        {
            File.AppendAllText("errors.txt", srBaseUrl + "\r\n" + myDownloadResult.occuredException.StackTrace + "\r\n\r\n\r\n");

            // Record or increment the failure count for this URL, then persist the failure log.
            string srFailHash = srBaseUrl.func_GenerateURLHash();
            if (dicFailedUrls.ContainsKey(srFailHash))
            {
                dicFailedUrls[srFailHash].irFailCount++;
                dicFailedUrls[srFailHash].dtPause = DateTime.Now;
            }
            else
            {
                dicFailedUrls.Add(srFailHash, new HTTPDownloader.csUrlFails { dtPause = DateTime.Now, irFailCount = 1, srUrl = srBaseUrl });
            }
            HTTPDownloader.writeFailedUrlsToFile(dicFailedUrls);
        }

        if (myDownloadResult.httpStatusResult == System.Net.HttpStatusCode.OK)
        {
            // We add the new successfully crawled URL to our crawled URL database
            // and cache the page source on disk for future runs.
            hsCrawledLinks.Add(srBaseUrl);
            File.AppendAllText(srCrawledLinksPath, srBaseUrl + "\r\n");
            File.WriteAllText(srDownloadedFileSaveName, myDownloadResult.srCrawledPageSource);
        }
    }

    // A failed download leaves the page source null; guard before parsing,
    // otherwise HtmlDocument.LoadHtml(null) throws an ArgumentNullException.
    if (string.IsNullOrEmpty(myDownloadResult.srCrawledPageSource))
    {
        return;
    }

    HtmlDocument hdDoc = new HtmlDocument();
    hdDoc.LoadHtml(myDownloadResult.srCrawledPageSource);

    var links = hdDoc.DocumentNode.SelectNodes("//a");
    List<string> lstDiscoveredLinks = new List<string>();

    if (links != null)
    {
        foreach (var vrNode in links)
        {
            if (vrNode.Attributes["href"] == null)
            {
                continue;
            }

            string srNewAbsLink = vrNode.Attributes["href"].Value;
            srNewAbsLink = HTTPDownloader.ReturnAbsUrl(srBaseUrl, srNewAbsLink, "toros.edu.tr");
            if (srNewAbsLink == null)
            {
                continue;
            }

            // Skip URLs whose suffix is on the ignore list. The original code ran a
            // second, case-sensitive LINQ check labeled "equals to above foreach";
            // it was redundant (and not actually equivalent), so only the
            // case-insensitive check is kept.
            bool blByPass = false;
            foreach (var item in HTTPDownloader.lstIgnore)
            {
                if (srNewAbsLink.ToLowerInvariant().EndsWith(item))
                {
                    blByPass = true;
                    break;
                }
            }
            if (blByPass)
            {
                continue;
            }

            lstDiscoveredLinks.Add(srNewAbsLink);
            hsDiscoveredLinks.Add(srNewAbsLink);
            Debug.WriteLine(srNewAbsLink);
        }
    }

    // Persist newly discovered links (deduplicated) so an interrupted crawl can resume.
    lstDiscoveredLinks = lstDiscoveredLinks.Distinct().ToList();
    File.AppendAllLines(srLinksDiscoveredPath, lstDiscoveredLinks);
}
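// Neither the shared crawler state nor the HTTPDownloader helper is defined in this
// section. The sketch below shows the shapes the two methods above rely on. The field,
// class, and method names come from the code itself, but the member layouts, the file
// paths, the sample ignore-list entries, and the signatures are assumptions; the
// method bodies are deliberately left unimplemented.
static HashSet<string> hsCrawledLinks = new HashSet<string>();
static HashSet<string> hsDiscoveredLinks = new HashSet<string>();
static Dictionary<string, HTTPDownloader.csUrlFails> dicFailedUrls = new Dictionary<string, HTTPDownloader.csUrlFails>();
static string sourceFilesDirectory = "sourceFiles";      // placeholder path
static string srCrawledLinksPath = "crawledLinks.txt";   // placeholder path
static string srLinksDiscoveredPath = "discoveredLinks.txt"; // placeholder path

public static class HTTPDownloader
{
    // Per-URL failure record backing the 3-strikes / 24-hour pause logic.
    public class csUrlFails
    {
        public string srUrl;
        public int irFailCount;
        public DateTime dtPause;
    }

    // Result of one page download: the HTML source, the status code, and any exception.
    public class WebPageDownloadResult
    {
        public string srCrawledPageSource;
        public System.Net.HttpStatusCode httpStatusResult;
        public Exception occuredException;
    }

    // URL suffixes that should never be crawled; compared case-insensitively above.
    // These entries are illustrative examples, not the original list.
    public static List<string> lstIgnore = new List<string> { ".jpg", ".png", ".pdf", ".zip" };

    // Hypothetical signatures inferred from the call sites in the crawler.
    public static WebPageDownloadResult FuncCrawlGivenURL(string srUrl) { throw new NotImplementedException(); }
    public static string ReturnAbsUrl(string srBaseUrl, string srHref, string srAllowedDomain) { throw new NotImplementedException(); }
    public static void readFailedUrls(ref Dictionary<string, csUrlFails> dic) { throw new NotImplementedException(); }
    public static void writeFailedUrlsToFile(Dictionary<string, csUrlFails> dic) { throw new NotImplementedException(); }
}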