/// <summary>
/// Parses every file in <paramref name="folderPath"/> matched by the <c>*.htm</c> search pattern
/// into an <c>AppData</c> record, then saves the parsed records and the collected failures.
/// </summary>
/// <param name="folderPath">Folder whose <c>*.htm</c> files are enumerated.</param>
/// <param name="fileName">Destination handed to <c>AppData.save_to_file</c> for the parsed apps.</param>
/// <param name="errorsFileName">Destination handed to <c>MultithreadedCrawler.SaveExceptions</c> for the failures.</param>
static void AggregeteFolder(string folderPath, string fileName, string errorsFileName)
        {
            ProgressCounter  progress   = new ProgressCounter();
            List <AppData>   apps       = new List <AppData>();
            List <Exception> exceptions = new List <Exception>();

            foreach (string app_file_name in Directory.EnumerateFiles(folderPath, "*.htm"))
            {
                progress.LogEvery(10);
                progress.Increment();

                try
                {
                    HtmlDocument document = new HtmlDocument();
                    document.LoadHtml(File.ReadAllText(app_file_name));

                    // The page is read from disk, so no URL is available; an empty string is passed instead.
                    apps.Add(new AppData(document, ""));
                }
                catch (Exception e)
                {
                    // Best effort: one unreadable or unparsable file must not abort the whole run.
                    // The offending path is attached so it ends up in the saved error report.
                    e.Data["path"] = app_file_name;
                    exceptions.Add(e);

                    // List<T>.Count is a property; no need to go through the LINQ Count() extension.
                    Console.WriteLine("errors: {0}", exceptions.Count);
                }
            }

            MultithreadedCrawler.SaveExceptions(errorsFileName, exceptions);
            AppData.save_to_file(apps, fileName);
        }
示例#2
0
        /// <summary>
        /// Downloads <paramref name="current_url"/>, extracts its outgoing links, and stores the page
        /// as an app page, a developer page, or a generic page depending on the URL.
        /// Any failure is recorded together with the stage it occurred in instead of being rethrown.
        /// </summary>
        /// <param name="webClient">Client used to download the page; also passed on to <c>ProcessVideoId</c>.</param>
        /// <param name="current_url">URL of the page to process.</param>
        /// <param name="processId">Not used by this implementation.</param>
        /// <returns>
        /// The page's links after <c>normalize_url</c> and the <c>good_url</c> filter. The list is empty
        /// when the page has no links or when a failure happened before link extraction completed;
        /// links already extracted are still returned if a later stage fails.
        /// </returns>
        protected override List <string> ProcessUrlAndGetLinks(WebClient webClient, string current_url, int processId)
        {
            progress.Increment();
            progress.LogEvery(100);

            // 'stage' names the step in progress so the catch block can report where a failure happened.
            string        stage          = null;
            string        html           = null;
            List <string> filtered_links = null;

            try
            {
                stage = "downloading";
                html  = webClient.DownloadString(current_url);
                HtmlDocument document = new HtmlDocument();

                stage = "loading_html_document";
                document.LoadHtml(html);
                HtmlNode root = document.DocumentNode;

                stage = "get_links";
                // HtmlAgilityPack's SelectNodes returns null (not an empty collection) when nothing
                // matches. A page without any <a href> is valid and must not be treated as an error.
                var anchors = root.SelectNodes("//a[@href]");
                filtered_links = anchors == null
                    ? new List <string>()
                    : anchors
                        .Select(el => normalize_url(current_url, el.GetAttributeValue("href", null)))
                        .Where(good_url)
                        .ToList();

                if (AppData.IsAppUrl(current_url))
                {
                    stage = "parse_app";
                    AppData app = new AppData(document, current_url);
                    // 'apps' is guarded by a lock on itself wherever this method touches it.
                    lock (apps)
                    {
                        apps.Add(app);
                    }
                    SavePage(_appsPath, html, app.Id);

                    stage = "parse_youtube_page";
                    ProcessVideoId(webClient, app.videoID);
                }
                else if (DeveloperData.IsDeveloperUrl(current_url))
                {
                    stage = "parse_developer";
                    DeveloperData developer = new DeveloperData(document, current_url);
                    lock (developers)
                    {
                        developers.Add(developer);
                    }
                    SavePage(_developersPath, html, developer.name);
                }
                else
                {
                    SavePage(_otherPagesPath, html);
                }
            }
            catch (Exception e)
            {
                // Best effort: record the failure and keep crawling.
                // NOTE(review): 'html' is still null here when the download itself failed —
                // confirm SaveFaultyPage tolerates a null page.
                AddException(e, current_url, stage);
                SaveFaultyPage(html, stage, e);
            }

            return(filtered_links ?? new List <string>());
        }