/// <summary>
/// Downloads a single page, tries to parse it as an app page, saves the raw HTML to disk,
/// and returns the outgoing links that pass <c>good_url</c>.
/// </summary>
/// <param name="webClient">Client used to download the page body.</param>
/// <param name="current_url">Address of the page to download; also the base passed to <c>normalize_url</c>.</param>
/// <param name="processId">Identifier of the calling worker; not used by this method.</param>
/// <returns>
/// The normalized links found on the page that pass <c>good_url</c>. The list is empty when
/// the page contains no links or when downloading/processing the page throws (the exception
/// is recorded through <c>AddException</c>).
/// </returns>
protected override List<string> ProcessUrlAndGetLinks(WebClient webClient, string current_url, int processId)
{
    try
    {
        // Print a throughput line whenever the counter is a multiple of ten.
        if (progress.Counter % 10 == 0)
        {
            Console.WriteLine("progress.Counter = {0}, progress.CountPerSecond = {1}", progress.Counter, progress.CountPerSecond());
        }
        progress.Increment();

        string html = webClient.DownloadString(current_url);
        HtmlDocument document = new HtmlDocument();
        document.LoadHtml(html);
        HtmlNode root = document.DocumentNode;

        // SelectNodes yields null (not an empty collection) when nothing matches the XPath.
        // Without this fallback a page with no links raised a NullReferenceException, which
        // the outer catch recorded as an error and which skipped parsing and saving the page.
        IEnumerable<HtmlNode> anchors = root.SelectNodes("//a[@href]") ?? Enumerable.Empty<HtmlNode>();

        // Normalize every link first, then filter, keeping the original evaluation order.
        List<string> links = anchors
            .Select(el => normalize_url(current_url, el.GetAttributeValue("href", null)))
            .ToList();
        List<string> filtered_links = links.Where(good_url).ToList();

        IAppDataUS app = null;
        try
        {
            // Parsing is best-effort: a failure is recorded, and the page is still saved
            // below with app == null.
            Console.WriteLine(current_url);
            app = new IAppDataUS(document, current_url);
            lock (apps)
            {
                apps.Add(app);
            }
        }
        catch (Exception e)
        {
            AddException(e, current_url);
        }

        save_page_to_file(root, html, app);
        return filtered_links;
    }
    catch (Exception e)
    {
        // Any download/processing failure is recorded and the page contributes no links.
        AddException(e, current_url);
        return new List<string>();
    }
}
/// <summary>
/// Writes the raw HTML of a page to a new file under <c>path</c>.
/// The file name is "&lt;id&gt; &lt;prefix&gt; &lt;app name&gt;.htm", where the id comes from an
/// atomically incremented counter and the prefix is "GENRE:" or "APP:" depending on which
/// filter the app URL starts with (empty when neither matches or <paramref name="app"/> is null).
/// </summary>
/// <param name="root">Root node of the parsed page; not used by this method.</param>
/// <param name="html">Page content written to the file.</param>
/// <param name="app">Parsed app data for the page, or null when parsing failed.</param>
protected void save_page_to_file(HtmlNode root, string html, IAppDataUS app)
{
    string addition = "";
    if (app != null)
    {
        // URL prefixes are identifiers, not linguistic text, so compare ordinally rather
        // than with the current culture's rules.
        if (app.url.StartsWith(iosGenreFilter, StringComparison.Ordinal))
        {
            addition = "GENRE:";
        }
        else if (app.url.StartsWith(iosAppFilter, StringComparison.Ordinal))
        {
            addition = "APP:";
        }
    }

    // Interlocked keeps the ids unique when several workers save pages concurrently.
    int doc_id = Interlocked.Increment(ref doc_id_counter);

    // NOTE(review): the "GENRE:" / "APP:" prefixes put a ':' into the file name, which is
    // not a valid file-name character on Windows — confirm the target platform.
    string app_name = app == null ? "" : app.name.MakeFileName();
    string file_name = doc_id + " " + addition + " " + app_name + ".htm";
    string file_path = Path.Combine(path, file_name);

    // NOTE(review): Encoding.Default depends on the runtime/system; characters outside that
    // encoding may not round-trip — confirm this is intended.
    File.WriteAllText(file_path, html, Encoding.Default);
}
/// <summary>
/// Finish hook: saves the collected app data to <c>_aggregatedDataFileName</c> and the
/// collected exceptions to <c>_errorsFileName</c>.
/// </summary>
protected override void OnFinished()
{
    try
    {
        IAppDataUS.save_to_file(apps, _aggregatedDataFileName);
    }
    finally
    {
        // Write the error log even when saving the aggregated data throws, so the
        // exceptions gathered so far are not lost; the original exception still propagates.
        SaveExceptions(_errorsFileName, exceptions);
    }
}