ScrapeCriteriaBuilder, WebScraper C# (CSharp)代码示例

示例#1

0

显示文件

        /// <summary>
        /// Prompts for a Craigslist city and a search term, downloads the matching search page,
        /// and prints every element produced by the scraper (or a "no matches" notice).
        /// Any exception is reported by printing its message to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // SCRAPING PROCESS
            // Download page content
            // Use regular expressions & builder pattern to create search criteria - search through elements we want in HTML
            // scrape

            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from:");
                var craigslistCity = Console.ReadLine() ?? string.Empty;
                Console.WriteLine("Please enter the CraigsList category that you would like to scrape:");
                var craigslistCategoryName = Console.ReadLine() ?? string.Empty;

                // The city becomes the sub-domain, so embedded spaces are removed.
                var citySubdomain = craigslistCity.Replace(" ", string.Empty);

                // The search term is free-form user input placed in the query string; percent-encode it
                // so characters such as '&', '#', '+' or spaces cannot corrupt the URL.
                var encodedQuery = Uri.EscapeDataString(craigslistCategoryName);

                using (WebClient client = new WebClient())
                {
                    string content = client.DownloadString($"http://{citySubdomain}.craigslist.org/{Method}/sss?query={encodedQuery}");

                    // NOTE(review): RegexOptions.ExplicitCapture disables the unnamed (.*?) groups in the
                    // outer pattern; presumably Scraper only needs the whole match there and applies the
                    // part patterns to it - confirm against Scraper.
                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>") // regex to pick out specific things from html element <a>
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@">(.*?)</a>") // specifying parts in the link - using builder within a builder
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@"href=\""(.*?)\""")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    .Build(); // Now building all of it


                    Scraper scraper = new Scraper();

                    var scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria");
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

示例#2

0

显示文件

        /// <summary>
        /// Asks for a Craigslist city and category, downloads that category page, runs the
        /// scrape criteria over it and writes each scraped element to the console.
        /// Exceptions are reported by printing their message.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                Console.Write("Enter the city to scrape information from: ");
                string city = Console.ReadLine() ?? string.Empty;

                Console.Write("Enter the CraigList category: ");
                string category = Console.ReadLine() ?? string.Empty;

                using (var webClient = new WebClient())
                {
                    // Spaces are removed from the city because it is used as the sub-domain.
                    var pageUrl = $"http://{city.Replace(" ", string.Empty)}.craigslist.org/{Method}/{category}";
                    var html    = webClient.DownloadString(pageUrl);

                    // Part 1 pulls the link text out of each matched anchor.
                    var titlePart = new ScrapeCriteriaPartBuilder()
                                    .WithRegex(@">(.*?)</a>")
                                    .WithRegexOption(RegexOptions.Singleline)
                                    .Build();

                    // Part 2 pulls the href attribute out of each matched anchor.
                    var hrefPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex(@"href=\""(.*?)\""")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                                              .WithData(html)
                                              .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                              .WithRegexOption(RegexOptions.ExplicitCapture)
                                              .WithPart(titlePart)
                                              .WithPart(hrefPart)
                                              .Build();

                    var results = new Scraper().Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria.");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }
                }

                // Trailing blank line after the output.
                Console.WriteLine();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

示例#3

0

显示文件

        /// <summary>
        /// Reads a Craigslist city and category from the console, downloads the category page over
        /// HTTPS, scrapes it and prints each scraped element, then waits for Enter before exiting.
        /// Exceptions are reported by printing their message.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                // Console.ReadLine returns null when input is exhausted (e.g. redirected stdin at EOF);
                // fall back to an empty string so the Replace call below cannot throw
                // NullReferenceException.
                Console.WriteLine("Enter which city you would like to scrape info from:");
                var cityName = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Enter which the CraigsList category:");
                var categoryName = Console.ReadLine() ?? string.Empty;

                using (WebClient client = new WebClient())
                {
                    // Spaces are removed from the city because it is used as the sub-domain.
                    string content = client.DownloadString($"https://{cityName.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryName}");

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    .WithParts(new ScrapeCriteriaPartBuilder()
                                                               .WithRegex(@">(.*?)</a>") // link text
                                                               .WithRegexOption(RegexOptions.Singleline)
                                                               .Build())
                                                    .WithParts(new ScrapeCriteriaPartBuilder()
                                                               .WithRegex(@"href=\""(.*?)\""") // link target
                                                               .WithRegexOption(RegexOptions.Singleline)
                                                               .Build())
                                                    .Build();

                    Scraper scraper         = new Scraper();
                    var     scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("there were no matches for scraping");
                    }
                }

                // Keep the console window open until the user presses Enter.
                Console.ReadLine();
            }
            catch (Exception e)
            {
                Console.WriteLine(e.Message);
            }
        }

示例#4

0

显示文件

        /// <summary>
        /// Prompts for a Craigslist city and category, downloads the category page, scrapes the
        /// result links and prints each scraped element (or "no matches").
        /// Failures such as an unreachable city sub-domain are reported by printing the exception
        /// message instead of terminating the process with an unhandled exception.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape:");

                var craigsListCity = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the CraigsList category you would like to scrape:");

                var craigsListCategory = Console.ReadLine() ?? string.Empty;

                using (WebClient client = new WebClient())
                {
                    // Spaces are removed from the city because it is used as the sub-domain.
                    string content = client.DownloadString($"http://{craigsListCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craigsListCategory}");

                    // The anchor class is "result-title hdrlnk" (hyphen); the previous "result=title"
                    // typo could never match that class attribute.
                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    .WithRegexOption(System.Text.RegularExpressions.RegexOptions.ExplicitCapture)
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@">(.*?)</a>") // link text
                                                              .WithRegexOption(System.Text.RegularExpressions.RegexOptions.Singleline)
                                                              .Build())
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@"href=\""(.*?)\""") // link target
                                                              .WithRegexOption(System.Text.RegularExpressions.RegexOptions.Singleline)
                                                              .Build())
                                                    .Build();

                    Scraper scraper = new Scraper();

                    var scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("no matches");
                    }
                }
            }
            catch (Exception ex)
            {
                // DownloadString throws (e.g. WebException) for an unknown city or category.
                Console.WriteLine(ex.Message);
            }
        }

示例#5

0

显示文件

文件： Program.cs 项目： PauloB04/WebScraper

        /// <summary>
        /// Two-pass Craigslist scrape: first downloads the city page and lists the categories found
        /// on it, then asks the user for a category value, downloads that category page and prints
        /// the scraped elements. When no categories are found, the default category "sss" is used.
        /// Errors are reported by printing the exception message; the program always waits for a
        /// key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter the city you would like to scrape information from:");
                var craigsListCity = Console.ReadLine() ?? String.Empty;

                // First pass: city only, used to discover the available categories.
                WebData webData = new WebDataBuilder()
                                  .WithCity(craigsListCity)
                                  .Build();

                WebDownloader downloadContent = new WebDownloader();

                Content = downloadContent.DownloadContentFrom(webData);

                CategoryScraper scrapeCategory = new CategoryScraper();
                Categories = scrapeCategory.GetCategoryFrom(Content);

                // Default category, kept when the category list is empty.
                var userCategory = "sss";

                if (Categories.Any())
                {
                    // Categories is consumed as a flat list of pairs: [c] is printed as the value
                    // and [c + 1] as the display name. Bounding on c + 1 keeps a trailing unpaired
                    // entry (odd Count) from indexing past the end of the list.
                    int x = Categories.Count;
                    for (int c = 0; c + 1 < x; c += 2)
                    {
                        Console.WriteLine("Category: {0}, Value: {1}", Categories[c + 1], Categories[c]);
                        Console.WriteLine();
                    }

                    Console.Write("Please enter the \"Value\" of the category you'd like to scrape elements from:");
                    userCategory = Console.ReadLine() ?? String.Empty;
                }
                else
                {
                    Console.WriteLine("There were no elements found in the category list.");
                    Console.Write("A default category will be chosen for you.");
                }

                // Second pass: city + chosen category.
                webData = new WebDataBuilder()
                          .WithCity(craigsListCity)
                          .WithCategory(userCategory)
                          .Build();

                Content = downloadContent.DownloadContentFrom(webData);

                //Need to check for errors on userCategory

                // https://boston.craigslist.org/search //link example for city only
                // https://boston.craigslist.org/search/cta //link example w/ category

                ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                .WithData(Content)
                                                .WithRegex(@"<a href=""(.*?)"" data-id=""(.*?)"" class=""(.*?)"">(.*?)</a>") //this regex pattern works
                                                .WithRegexOption(RegexOptions.ExplicitCapture)
                                                .WithParts(new ScrapeCriteriaPartBuilder()
                                                           .WithRegex(@">(.*?)<") // link text
                                                           .WithRegexOption(RegexOptions.Singleline)
                                                           .Build())
                                                .WithParts(new ScrapeCriteriaPartBuilder()
                                                           .WithRegex(@"href=""(.*?)""") // link target
                                                           .WithRegexOption(RegexOptions.Singleline)
                                                           .Build())
                                                .Build();

                Scraper scraper = new Scraper();

                var scrapedElements = scraper.Scrape(scrapeCriteria);

                if (scrapedElements.Any())
                {
                    int count = 1;
                    foreach (var scrapedElement in scrapedElements)
                    {
                        Console.WriteLine(scrapedElement);

                        // Blank line after every second element to visually group the output in pairs.
                        if (count % 2 == 0)
                        {
                            Console.WriteLine();
                        }

                        count++;
                    }
                }
                else
                {
                    Console.WriteLine("There were no matches found for the specified scrape Criteria.");
                }
            }
            catch (Exception ex) { Console.WriteLine("There was an error found: {0}", ex.Message); }

            Console.WriteLine();
            Console.WriteLine("The program will close shortly, please acknowledge by pressing any key.");
            Console.ReadKey();
        }

C# (CSharp) WebScraper ScrapeCriteriaBuilder示例