/// <summary>
/// Parses every <c>*.htm</c> file directly inside <paramref name="folderPath"/> into an
/// <c>AppData</c> record, then writes the collected records and the collected
/// parse failures to their respective output files.
/// </summary>
/// <param name="folderPath">Directory that is enumerated for <c>*.htm</c> files.</param>
/// <param name="fileName">Destination passed to <c>AppData.save_to_file</c> for the parsed records.</param>
/// <param name="errorsFileName">Destination passed to <c>MultithreadedCrawler.SaveExceptions</c> for the failures.</param>
static void AggregeteFolder(string folderPath, string fileName, string errorsFileName)
{
    ProgressCounter progress = new ProgressCounter();
    List<AppData> parsedApps = new List<AppData>();
    List<Exception> failures = new List<Exception>();

    foreach (string htmlPath in Directory.EnumerateFiles(folderPath, "*.htm"))
    {
        progress.LogEvery(10);
        progress.Increment();

        try
        {
            HtmlDocument page = new HtmlDocument();
            string markup = File.ReadAllText(htmlPath);
            page.LoadHtml(markup);

            // The source URL is unknown for files read from disk, so an empty string is passed.
            parsedApps.Add(new AppData(page, ""));
        }
        catch (Exception error)
        {
            // Remember which file failed so the saved error report can point back to it.
            error.Data["path"] = htmlPath;
            failures.Add(error);
            Console.WriteLine("errors: {0}", failures.Count);
        }
    }

    MultithreadedCrawler.SaveExceptions(errorsFileName, failures);
    AppData.save_to_file(parsedApps, fileName);
}
/// <summary>
/// Downloads <paramref name="current_url"/>, parses it as HTML, stores the page according to its
/// kind (app page, developer page, or anything else) and returns the outgoing links that pass
/// the <c>good_url</c> filter.
/// </summary>
/// <param name="webClient">Client used to download the page; also handed to <c>ProcessVideoId</c> for app pages.</param>
/// <param name="current_url">URL of the page to download and process.</param>
/// <param name="processId">Not referenced by this method body.</param>
/// <returns>
/// The filtered links found on the page. An empty list is returned when a failure happened
/// before the links were collected; failures are recorded via <c>AddException</c> and
/// <c>SaveFaultyPage</c> rather than propagated.
/// </returns>
protected override List<string> ProcessUrlAndGetLinks(WebClient webClient, string current_url, int processId)
{
    progress.Increment();
    progress.LogEvery(100);

    // 'stage' names the step in progress so that a failure can be attributed to it in the catch block.
    string stage = null;
    string html = null;
    List<string> filtered_links = null;

    try
    {
        stage = "downloading";
        html = webClient.DownloadString(current_url);

        HtmlDocument document = new HtmlDocument();
        stage = "loading_html_document";
        document.LoadHtml(html);
        HtmlNode root = document.DocumentNode;

        stage = "get_links";
        // SelectNodes returns null (not an empty collection) when nothing matches, so a page
        // without any <a href> must be treated as having zero links instead of failing.
        IEnumerable<HtmlNode> anchors = root.SelectNodes("//a[@href]");
        if (anchors == null)
        {
            anchors = Enumerable.Empty<HtmlNode>();
        }

        List<string> links = anchors
            .Select(el => normalize_url(current_url, el.GetAttributeValue("href", null)))
            .ToList();
        filtered_links = links.Where(good_url).ToList();

        if (AppData.IsAppUrl(current_url))
        {
            stage = "parse_app";
            AppData app = new AppData(document, current_url);
            lock (apps)
            {
                apps.Add(app);
            }
            SavePage(_appsPath, html, app.Id);

            stage = "parse_youtube_page";
            ProcessVideoId(webClient, app.videoID);
        }
        else if (DeveloperData.IsDeveloperUrl(current_url))
        {
            stage = "parse_developer";
            DeveloperData developer = new DeveloperData(document, current_url);
            lock (developers)
            {
                developers.Add(developer);
            }
            SavePage(_developersPath, html, developer.name);
        }
        else
        {
            SavePage(_otherPagesPath, html);
        }
    }
    catch (Exception e)
    {
        // Best-effort crawl: record the failure and the offending page, then carry on.
        // 'html' is still null here if the download itself failed.
        AddException(e, current_url, stage);
        SaveFaultyPage(html, stage, e);
    }

    // Links gathered before a later stage failed are still returned so the crawl can continue.
    return filtered_links ?? new List<string>();
}