예제 #1
0
        /// <summary>
        /// Checks that scraping one matching anchor with two criteria parts returns
        /// exactly two elements: the anchor text first, then the href value.
        /// </summary>
        public void FindCollectionWithTwoParts()
        {
            // Arrange: one anchor surrounded by unrelated text.
            var content = "Some dummy data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more dummy data";

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                            .WithData(content)
                                            // Pattern for the whole <a> element.
                                            .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                            .WithRegexOption(RegexOptions.ExplicitCapture)
                                            // Part pattern for the text between the tags (a builder nested in a builder).
                                            .WithPart(new ScrapeCriteriaPartBuilder()
                                                      .WithRegex(@">(.*?)</a>")
                                                      .WithRegexOption(RegexOptions.Singleline)
                                                      .Build())
                                            // Part pattern for the href attribute value.
                                            .WithPart(new ScrapeCriteriaPartBuilder()
                                                      .WithRegex(@"href=\""(.*?)\""")
                                                      .WithRegexOption(RegexOptions.Singleline)
                                                      .Build())
                                            .Build();

            // Act
            var foundElements = _scraper.Scrape(scrapeCriteria);

            // Assert: AreEqual reports expected and actual values when it fails,
            // unlike IsTrue over an equality expression.
            Assert.AreEqual(2, foundElements.Count);
            Assert.AreEqual("some text", foundElements[0]); // the description
            Assert.AreEqual("http://domain.com", foundElements[1]);
        }
예제 #2
0
        /// <summary>
        /// Checks that scraping one matching anchor with two criteria parts returns
        /// exactly two elements: the anchor text first, then the href value.
        /// </summary>
        public void FindCollectionWithTwoParts()
        {
            // Arrange: one anchor surrounded by unrelated text.
            var content = "Some fluff data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more fluff data";

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                            .WithData(content)
                                            // Pattern for the whole <a> element.
                                            .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                            .WithRegexOption(RegexOptions.ExplicitCapture)
                                            // Part pattern for the text between the tags.
                                            .WithPart(new ScrapeCriteriaPartBuilder()
                                                      .WithRegex(@">(.*?)</a>")
                                                      .WithRegexOption(RegexOptions.Singleline)
                                                      .Build())
                                            // Part pattern for the href attribute value.
                                            .WithPart(new ScrapeCriteriaPartBuilder()
                                                      .WithRegex(@"href=\""(.*?)\""")
                                                      .WithRegexOption(RegexOptions.Singleline)
                                                      .Build())
                                            .Build();

            // Act
            var foundElements = _scraper.Scrape(scrapeCriteria);

            // Assert: AreEqual surfaces the actual value in the failure message.
            Assert.AreEqual(2, foundElements.Count);
            Assert.AreEqual("some text", foundElements[0]);
            Assert.AreEqual("http://domain.com", foundElements[1]);
        }
예제 #3
0
        /// <summary>
        /// Checks that scraping one listing anchor with two criteria parts returns
        /// exactly two elements: the item description first, then the listing URL.
        /// </summary>
        public void FindCollectionWithTwoParts()
        {
            // Arrange: one listing anchor embedded in filler text.
            string content = "Lorem ipsum <a href=\"http://domain.com\" data-id=\"dataID\" class=\"result-title hdrlnk\">Item Description</a> dolor sit amet ";

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                            .WithData(content)
                                            // Regex for the entire listing element.
                                            .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                            .WithRegexOption(RegexOptions.ExplicitCapture)
                                            // Part for the listing description.
                                            .WithPart(new ScrapeCriteriaPartBuilder()
                                                      .WithRegex(@">(.*?)</a>")
                                                      .WithRegexOption(RegexOptions.Singleline)
                                                      .Build())
                                            // Part for the listing URL.
                                            .WithPart(new ScrapeCriteriaPartBuilder()
                                                      .WithRegex(@"href=\""(.*?)\""")
                                                      .WithRegexOption(RegexOptions.Singleline)
                                                      .Build())
                                            .Build();

            // Act
            var foundElements = _scraper.Scrape(scrapeCriteria);

            // Assert: AreEqual surfaces the actual value in the failure message.
            Assert.AreEqual(2, foundElements.Count);
            Assert.AreEqual("Item Description", foundElements[0]);
            Assert.AreEqual("http://domain.com", foundElements[1]);
        }
        /// <summary>
        /// Checks that scraping a realistic listing anchor with two criteria parts returns
        /// exactly two elements: the listing title first, then the listing URL.
        /// </summary>
        public void FindCollectionWithTwoParts()
        {
            // Arrange: one listing anchor surrounded by unrelated text.
            var content = @"some stuff data <a href=""https://boston.craigslist.org/bmw/ctd/d/waltham-2010-lexus-is-250c-base-2dr/7073589362.html"" data-id=""7073589362"" class=""result-title hdrlnk"">2010 Lexus IS 250C Base 2dr Convertible 6A - EASY FINANCING!</a> and more fluff data";

            var scrapeCriteria = new ScrapeCriteriaBuilder()
                                 .WithData(content)
                                 // Pattern for the whole listing anchor.
                                 .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                 .WithRegexOption(RegexOptions.ExplicitCapture)
                                 // Part pattern for the text between the tags.
                                 .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
                                                         .WithRegex(@">(.*?)</a>")
                                                         .WithRegexOption(RegexOptions.Singleline)
                                                         .Build())
                                 // Part pattern for the href attribute value.
                                 .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
                                                         .WithRegex(@"href=\""(.*?)\""")
                                                         .WithRegexOption(RegexOptions.Singleline)
                                                         .Build())
                                 .Build();

            // Act
            var foundElements = _scrapper.Scrape(scrapeCriteria);

            // Assert: AreEqual surfaces the actual value in the failure message.
            Assert.AreEqual(2, foundElements.Count);
            Assert.AreEqual(@"2010 Lexus IS 250C Base 2dr Convertible 6A - EASY FINANCING!", foundElements[0]);
            Assert.AreEqual(@"https://boston.craigslist.org/bmw/ctd/d/waltham-2010-lexus-is-250c-base-2dr/7073589362.html", foundElements[1]);
        }
예제 #5
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org page, scrapes it with a two-part criteria and prints each
        /// scraped element. Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from: ");
                var cityInput = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the CraigsList category that you would like to scrape: ");
                var categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient downloader = new WebClient())
                {
                    // Spaces are stripped from the city before it is used as the subdomain.
                    // Method is a member defined outside this method.
                    var listingsUrl = $"https://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                    string pageHtml = downloader.DownloadString(listingsUrl);

                    // Overall pattern for one listing anchor.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                                          .WithData(pageHtml)
                                          .WithRegex(@"<a href=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                          .WithRegexOption(RegexOptions.ExplicitCapture);

                    // Part pattern for the text between the anchor tags.
                    var titlePart = new ScrapeCriteriaPartBuilder()
                                    .WithRegex(@">(.*?)</a>")
                                    .WithRegexOption(RegexOptions.Singleline)
                                    .Build();

                    // Part pattern for the href attribute value.
                    var hrefPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex(@"href=\""(.*?)\""")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    ScrapeCriteria criteria = criteriaBuilder
                                              .WithPart(titlePart)
                                              .WithPart(hrefPart)
                                              .Build();

                    Scraper pageScraper = new Scraper();
                    var results = pageScraper.Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There are no matches for the spicified criteria");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }

            // Wait for a key press so the console output stays visible.
            Console.ReadKey();
        }
예제 #6
0
        /// <summary>
        /// Verifies that scraping a single listing anchor with two criteria parts yields
        /// two elements: the anchor text followed by the href value.
        /// </summary>
        public void FindCollectionWithTwoParts()
        {
            // The data to scrape is just one listing anchor.
            var content = "<a href=\"https://boston.craigslist.org/nos/cto/d/2001-ford-350-xlt/6570954158.html\" data-id=\"6570954158\" class=\"result-title hdrlnk\">2001 Ford F 350 XLT</a>";

            // Overall pattern for the listing anchor.
            var listingBuilder = new ScrapeCriteriaBuilder()
                                 .WithData(content)
                                 .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                 .WithRegexOption(RegexOptions.ExplicitCapture);

            // Part pattern for the text between the anchor tags.
            var titlePart = new ScrapeCriteriaPartBuilder()
                            .WithRegex(@">(.*?)</a>")
                            .WithRegexOption(RegexOptions.Singleline)
                            .Build();

            // Part pattern for the href attribute value.
            var linkPart = new ScrapeCriteriaPartBuilder()
                           .WithRegex(@"href=\""(.*?)\""")
                           .WithRegexOption(RegexOptions.Singleline)
                           .Build();

            ScrapeCriteria scrapeCriteria = listingBuilder
                                            .WithPart(titlePart)
                                            .WithPart(linkPart)
                                            .Build();

            var foundElements = _scraper.Scrape(scrapeCriteria);

            // Two elements are expected: the title, then the link.
            Assert.True(foundElements.Count == 2);
            Assert.True(foundElements[0] == "2001 Ford F 350 XLT");
            Assert.True(foundElements[1] == "https://boston.craigslist.org/nos/cto/d/2001-ford-350-xlt/6570954158.html");
        }
예제 #7
0
        /// <summary>
        /// Checks that a criteria with an href part followed by a tag-content part returns
        /// two elements in that order: the link, then the text between the tags
        /// (including its surrounding spaces).
        /// </summary>
        public void ScraperTakesTwoParts_ReturnsLinkAndContentWithinTag()
        {
            // Arrange
            string content = "Some data present here <a href=\"anysite.com\" class=\"anyClass\"> Content within tag </a>";

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                            .WithData(content)
                                            // Pattern for the whole anchor element.
                                            .WithRegex(@"<a href=""(.*?)"" class=""(.*?)""> (.*?) </a>")
                                            .WithRegexOption(RegexOptions.ExplicitCapture)
                                            // Part pattern for the href attribute value.
                                            .WithParts(new ScrapeCriteriaPartBuilder()
                                                       .WithRegex(@"href=""(.*?)""")
                                                       .WithRegexOption(RegexOptions.Singleline)
                                                       .Build())
                                            // Part pattern for the text between '>' and '<'.
                                            .WithParts(new ScrapeCriteriaPartBuilder()
                                                       .WithRegex(@">(.*?)<")
                                                       .WithRegexOption(RegexOptions.Singleline)
                                                       .Build())
                                            .Build();

            List<string> expectedElements = new List<string>
            {
                "anysite.com", " Content within tag "
            };

            // Act
            var scrapedElements = _scraper.Scrape(scrapeCriteria);

            // Assert: expected value goes first so failure messages label the values correctly.
            Assert.AreEqual(2, scrapedElements.Count);
            Assert.AreEqual(expectedElements[0], scrapedElements[0]);
            Assert.AreEqual(expectedElements[1], scrapedElements[1]);
        }
예제 #8
0
        /// <summary>
        /// Console entry point: clears the console, asks for a place and a category,
        /// downloads the matching craigslist page, scrapes it with a two-part criteria
        /// and prints each scraped element with a bullet prefix. Any exception is
        /// reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            Console.Clear();

            try
            {
                Console.Write("Inserissci il nome del luogo: ");
                var craigListCity = Console.ReadLine() ?? "";
                Console.Write("Inserissci la categoria craigList: ");
                // ReadLine can return null at end of input; treat it the same way as the city.
                string craigListCategory = Console.ReadLine() ?? string.Empty;

                // The using block disposes the client once the work is finished.
                using (WebClient client = new WebClient())
                {
                    // Build the address once so the logged URL and the downloaded URL cannot diverge.
                    // method is a member defined outside this method.
                    string address = $"http://{craigListCity.Replace(" ", string.Empty)}.craigList.org/{method}/{craigListCategory}";
                    Console.WriteLine("<<<<<<<<<<<<<<<<<<<<<<" + address);
                    string content = client.DownloadString(address);

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // Pattern for one whole listing anchor.
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    // Part pattern for the text between the anchor tags.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@">(.*?)</a>")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    // Part pattern for the href attribute value.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@"href=\""(.*?)\""")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    .Build();

                    Scraper scraper = new Scraper();

                    var scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Count != 0)
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine("• " + scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("Non ho trovato niente!");
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
예제 #9
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org page, scrapes it with a two-part criteria and prints each
        /// scraped element. Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape from: ");

                // A null read (no input available) falls back to an empty string.
                var cityInput = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the CraigsList category that you would like to scrape: ");
                var categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient downloader = new WebClient())
                {
                    // Spaces are removed from the city name ("New York" becomes "NewYork").
                    // Method is a member defined outside this method.
                    var listingsUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                    string pageHtml = downloader.DownloadString(listingsUrl);

                    // Overall pattern for one listing anchor.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                                          .WithData(pageHtml)
                                          .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                          .WithRegexOption(RegexOptions.ExplicitCapture);

                    // Part pattern for the text between the anchor tags.
                    var titlePart = new ScrapeCriteriaPartBuilder()
                                    .WithRegex(@">(.*?)</a>")
                                    .WithRegexOption(RegexOptions.Singleline)
                                    .Build();

                    // Part pattern for the href attribute value.
                    var hrefPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex(@"href=\""(.*?)""")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    ScrapeCriteria criteria = criteriaBuilder
                                              .WithPart(titlePart)
                                              .WithPart(hrefPart)
                                              .Build();

                    Scraper pageScraper = new Scraper();
                    var results = pageScraper.Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria.");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
예제 #10
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org page, scrapes it with a two-part criteria and prints each
        /// scraped element. Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from: ");
                var cityInput = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the CraigsList category that you would like to scrape: ");
                var categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient downloader = new WebClient())
                {
                    // Replace strips every space from the city ("new york" becomes "newyork").
                    // Method is a member defined outside this method.
                    var listingsUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";

                    // DownloadString returns the page at that URL as a single string.
                    string pageHtml = downloader.DownloadString(listingsUrl);

                    // The downloaded page is the data; the regex describes one full listing anchor.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                                          .WithData(pageHtml)
                                          .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                          .WithRegexOptions(RegexOptions.ExplicitCapture);

                    // Part pattern for the listing title (the text between the anchor tags).
                    var titlePart = new ScrapeCriteriaPartBuilder()
                                    .WithRegex(@">(.*?)</a>")
                                    .WithRegexOption(RegexOptions.Singleline)
                                    .Build();

                    // Part pattern for the link to the listing (the href value).
                    var hrefPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex(@"href=\""(.*?)\""")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    ScrapeCriteria criteria = criteriaBuilder
                                              .WithPart(titlePart)
                                              .WithPart(hrefPart)
                                              .Build();

                    // The scraper receives the data and patterns through the criteria.
                    Scraper pageScraper = new Scraper();
                    var results = pageScraper.Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
예제 #11
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org page, scrapes it with a two-part criteria and prints each
        /// scraped element. It waits for a key press before returning, both after a
        /// normal run and after an exception whose message has been printed.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from:");
                var cityInput = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the CraigsList category that you would like to scrape:");
                var categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient downloader = new WebClient())
                {
                    // Spaces are stripped from the city before it is used as the subdomain.
                    // Method is a member defined outside this method.
                    var listingsUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                    string pageHtml = downloader.DownloadString(listingsUrl);

                    // Overall pattern for one listing anchor.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                                          .WithData(pageHtml)
                                          .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                          .WithRegexOption(RegexOptions.ExplicitCapture);

                    // Part pattern for the text between the anchor tags.
                    var titlePart = new ScrapeCriteriaPartBuilder()
                                    .WithRegex(@">(.*?)</a>")
                                    .WithRegexOption(RegexOptions.Singleline)
                                    .Build();

                    // Part pattern for the href attribute value.
                    var hrefPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex(@"href=\""(.*?)\""")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    ScrapeCriteria criteria = criteriaBuilder
                                              .WithPart(titlePart)
                                              .WithPart(hrefPart)
                                              .Build();

                    Scraper pageScraper = new Scraper();
                    var results = pageScraper.Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria.");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }

                    Console.WriteLine("Done. Press any key to exit...");
                    Console.ReadKey();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
                Console.WriteLine("Press any key to exit...");
                Console.ReadKey();
            }
        }
예제 #12
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org search page as UTF-8, scrapes it with a two-part criteria and
        /// prints each scraped element. Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Enter the City to scrape information from");
                // ReadLine returns null when no more input is available; fall back to an
                // empty string so the Replace call below cannot throw NullReferenceException.
                var city = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Enter the category to scrape information");
                var category = Console.ReadLine() ?? string.Empty;

                using (WebClient client = new WebClient())
                {
                    client.Encoding = Encoding.UTF8;

                    // Spaces are stripped from the city before it is used as the subdomain.
                    var url = $"https://{city.Replace(" ", string.Empty)}.craigslist.org/search/{category}";

                    string content = client.DownloadString(url);

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // Pattern for one whole listing anchor.
                                                    .WithRegex("<a href=\"(.*?)\" data-id=\"(.*?)\" class=\"result-title hdrlnk\">(.*?)</a>")
                                                    .WithRegexOptions(RegexOptions.ExplicitCapture)
                                                    // Part pattern for the text between the anchor tags.
                                                    .WithParts(new ScrapeCriteriaPartBuilder()
                                                               .WithRegex(">(.*?)</a>")
                                                               .WithRegexOptions(RegexOptions.Singleline)
                                                               .Build())
                                                    // Part pattern for the href attribute value.
                                                    .WithParts(new ScrapeCriteriaPartBuilder()
                                                               .WithRegex("href=\"(.*?)\"")
                                                               .WithRegexOptions(RegexOptions.Singleline)
                                                               .Build())
                                                    .Build();

                    Scraper scraper = new Scraper();

                    var elements = scraper.Scrape(scrapeCriteria);
                    if (elements.Any())
                    {
                        foreach (var element in elements)
                        {
                            Console.WriteLine(element);
                        }
                    }
                    else
                    {
                        Console.WriteLine("No matches found");
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
예제 #13
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org page, scrapes it with a two-part criteria and prints each
        /// scraped element. Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.Write("Please enter a city: ");
                var cityInput = Console.ReadLine() ?? string.Empty;

                Console.Write("Please enter a category: ");
                var categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient downloader = new WebClient())
                {
                    // Spaces are stripped from the city before it is used as the subdomain.
                    // Method is a member defined outside this method.
                    var listingsUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                    string pageHtml = downloader.DownloadString(listingsUrl);

                    // Overall pattern for one listing anchor.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                                          .WithData(pageHtml)
                                          .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                          .WithRegexOption(RegexOptions.ExplicitCapture);

                    // Part pattern for the text between the anchor tags.
                    var titlePart = new ScrapeCriteriaPartBuilder()
                                    .WithRegex(@">(.*?)</a>")
                                    .WithRegexOption(RegexOptions.Singleline)
                                    .Build();

                    // Part pattern for the href attribute value.
                    var hrefPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex(@"href=\""(.*?)\""")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    var criteria = criteriaBuilder
                                   .WithPart(titlePart)
                                   .WithPart(hrefPart)
                                   .Build();

                    var pageScraper = new Scraper();
                    var results = pageScraper.Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There were no matches.");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }
                }
            }
            catch (Exception exception)
            {
                Console.WriteLine(exception.Message);
            }
        }
예제 #14
0
        /// <summary>
        /// Console entry point: asks for a city and a category, downloads the matching
        /// craigslist.org page, scrapes it with a two-part criteria and prints each
        /// scraped element. Any exception is reported by printing its message.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from:");
                string cityInput = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the CraigsList category from availables:");
                string categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient downloader = new WebClient())
                {
                    // Spaces are stripped from the city before it is used as the subdomain.
                    // Method is a member defined outside this method.
                    var listingsUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                    string pageHtml = downloader.DownloadString(listingsUrl);

                    // Overall pattern for one listing anchor; quotes are escaped with
                    // backslashes because these are regular (non-verbatim) string literals.
                    var criteriaBuilder = new ScrapeCriteriaBuilder()
                                          .WithData(pageHtml)
                                          .WithRegex("<a href=\"(.*?)\" data-id=\"(.*?)\" class=\"result-title hdrlnk\">(.*?)</a>")
                                          .WithRegexOption(RegexOptions.ExplicitCapture);

                    // This part gets the description (the text between the anchor tags).
                    var descriptionPart = new ScrapeCriteriaPartBuilder()
                                          .WithRegex(">(.*?)</a>")
                                          .WithRegexOption(RegexOptions.Singleline)
                                          .Build();

                    // This part gets the link (the href attribute value).
                    var linkPart = new ScrapeCriteriaPartBuilder()
                                   .WithRegex("href=\"(.*?)\"")
                                   .WithRegexOption(RegexOptions.Singleline)
                                   .Build();

                    ScrapeCriteria criteria = criteriaBuilder
                                              .WithPart(descriptionPart)
                                              .WithPart(linkPart)
                                              .Build();

                    Scraper pageScraper = new Scraper();
                    var results = pageScraper.Scrape(criteria);

                    if (!results.Any())
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria");
                    }
                    else
                    {
                        foreach (var result in results)
                        {
                            Console.WriteLine(result);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }
예제 #15
0
        /// <summary>
        /// Scrapes text containing one matching anchor tag using a criteria without
        /// parts and checks that exactly one element comes back.
        /// </summary>
        public void FindCollectionWithNoParts()
        {
            string html = "Lorem ipsum <a href=\"http://domain.com\" data-id=\"dataID\" class=\"result-title hdrlnk\">Item Description</a> dolor sit amet";

            var criteriaBuilder = new ScrapeCriteriaBuilder()
                                  .WithData(html)
                                  .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                  .WithRegexOption(RegexOptions.ExplicitCapture);
            ScrapeCriteria criteria = criteriaBuilder.Build();

            var results = _scraper.Scrape(criteria);
            bool hasSingleResult = results.Count == 1;

            Assert.IsTrue(hasSingleResult);
        }
        /// <summary>
        /// Console entry point: asks for a city and a Craigslist category, downloads
        /// the matching page and prints every element the scraper returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please provide city in USA you would like to scrape info from Craigslist, for ex.: Boston");
                string cityInput = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please provide Craigslist category you would like to scrape, for ex.: cta (stands for Cars Trucks Automotive category, to obtain that simply navigate to desired category in web browser and check URL for name of category to provide)");
                string categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient webClient = new WebClient())
                {
                    // The city is used as the sub-domain, hence the removal of spaces.
                    string pageUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                    string pageHtml = webClient.DownloadString(pageUrl);

                    ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                        .WithData(pageHtml)
                        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                        .WithRegexOption(RegexOptions.ExplicitCapture)
                        // Part matching the text that precedes </a>.
                        .WithPart(new ScrapeCriteriaPartBuilder()
                            .WithRegex(@">(.*?)</a>")
                            .WithRegexOption(RegexOptions.Singleline)
                            .Build())
                        // Part matching the href attribute value.
                        .WithPart(new ScrapeCriteriaPartBuilder()
                            .WithRegex(@"href=\""(.*?)\""")
                            .WithRegexOption(RegexOptions.Singleline)
                            .Build())
                        .Build();

                    List<string> matches = new Scraper().Scrape(criteria);

                    if (!matches.Any())
                    {
                        Console.WriteLine("There was no match for specified scraping criteria");
                    }
                    else
                    {
                        matches.ForEach(match => Console.WriteLine($"Found match {match}"));
                    }
                }
            }
            catch (Exception e)
            {
                // Only the message is reported; the stack trace is dropped.
                Console.WriteLine(e.Message);
            }
        }
예제 #17
0
        /// <summary>
        /// Checks that a criteria without parts yields one element equal to the
        /// complete anchor tag found in the input text.
        /// </summary>
        public void FoundElementsWithNoParts()
        {
            var html = "Some fluff data <a href=\"http://domain.com\" data-id=\"someId\">some text</a> more fluff data";
            var expectedTag = "<a href=\"http://domain.com\" data-id=\"someId\">some text</a>";

            var criteriaBuilder = new ScrapeCriteriaBuilder()
                                  .WithData(html)
                                  .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"">(.*?)</a>")
                                  .WithRegexOptions(RegexOptions.ExplicitCapture);
            ScrapeCriteria criteria = criteriaBuilder.Build();

            var results = _scraper.Scrape(criteria);

            Assert.IsTrue(results.Count == 1);
            Assert.IsTrue(results[0] == expectedTag);
        }
        /// <summary>
        /// Scrapes a sample listing anchor with a criteria that has no parts and
        /// checks that the single result is the complete anchor tag.
        /// </summary>
        public void FindCollectionWithNoParts()
        {
            string sampleHtml = "Some dummy text <a href=\"https://boston.craigslist.org/gbs/ctd/d/2014-gmc-sierra-1500-regular/6765227753.html\" data-id=\"6765227753\" class=\"result-title hdrlnk\">2014 GMC Sierra 1500 Regular Cab Pickup 2D 6 1/2 ft pickup RED -</a> more dummy text";
            string expectedTag = "<a href=\"https://boston.craigslist.org/gbs/ctd/d/2014-gmc-sierra-1500-regular/6765227753.html\" data-id=\"6765227753\" class=\"result-title hdrlnk\">2014 GMC Sierra 1500 Regular Cab Pickup 2D 6 1/2 ft pickup RED -</a>";

            var criteriaBuilder = new ScrapeCriteriaBuilder()
                                  .WithData(sampleHtml)
                                  .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                  .WithRegexOption(RegexOptions.ExplicitCapture);
            ScrapeCriteria criteria = criteriaBuilder.Build();

            List<string> results = scraper.Scrape(criteria);

            Assert.IsTrue(results.Count == 1);
            Assert.IsTrue(results[0] == expectedTag);
        }
        /// <summary>
        /// Checks that scraping with no parts configured returns exactly one element
        /// and that it equals the complete anchor tag from the input.
        /// </summary>
        public void FindCollectionWithNoParts()
        {
            var html = "Some filler data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more filler data";
            var expectedTag = "<a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a>";

            var criteriaBuilder = new ScrapeCriteriaBuilder()
                                  .WithData(html)
                                  .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                  .WithRegexOption(System.Text.RegularExpressions.RegexOptions.ExplicitCapture);
            ScrapeCriteria criteria = criteriaBuilder.Build();

            var results = _scraper.Scrape(criteria);

            Assert.IsTrue(results.Count == 1);
            Assert.IsTrue(results[0] == expectedTag);
        }
예제 #20
0
        /// <summary>
        /// Console entry point: prompts for a city and a category, downloads the
        /// matching craigslist.org page and prints every element the scraper returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                Write("Please enter the city you like to be scraped:\t");
                var craiglistCity = ReadLine() ?? string.Empty;

                Write("Please enter the CraigList catagory:\t");
                var craiglistCatagory = ReadLine() ?? string.Empty;

                using (WebClient client = new WebClient())
                {
                    // The city is used as the sub-domain, so spaces are stripped from it.
                    string content = client.DownloadString($"http://{craiglistCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craiglistCatagory}");

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // The listing anchor carries two space-separated classes
                                                    // ("result-title hdrlnk"); a hyphen between them never matches.
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    // Part matching the text that precedes </a>.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@">(.*?)</a>")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    // Part matching the href attribute value, so the link is
                                                    // reported instead of the same text a second time.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@"href=\""(.*?)\""")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    .Build();

                    Scraper       scraper      = new Scraper();
                    List <string> scrapedItems = scraper.Scrape(scrapeCriteria);

                    if (scrapedItems.Any())
                    {
                        foreach (var item in scrapedItems)
                        {
                            // Print the scraped value itself rather than an empty line.
                            WriteLine(item);
                        }
                    }
                    else
                    {
                        WriteLine("There are no matches found .....");
                    }
                }
            }
            catch (Exception ex)
            {
                // Writes the full exception (type, message and stack trace).
                WriteLine(ex);
            }
        }
예제 #21
0
        /// <summary>
        /// Checks that a criteria without parts returns exactly one element and that
        /// it equals the complete anchor tag contained in the input text.
        /// </summary>
        public void ScraperTakesNoParts_ReturnsWholeAnchorTag()
        {
            string content = "Some data present here <a href=\"anysite.com\" class=\"anyClass\"> Content within tag </a>";

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                            .WithData(content)
                                            .WithRegex(@"<a href=""(.*?)"" class=""(.*?)""> (.*?) </a>")
                                            .WithRegexOption(RegexOptions.ExplicitCapture)
                                            .Build();

            var scrapedElements = _scraper.Scrape(scrapeCriteria);
            var expectedElement = "<a href=\"anysite.com\" class=\"anyClass\"> Content within tag </a>";

            // Expected value goes first so a failure message labels both values correctly.
            Assert.AreEqual(1, scrapedElements.Count);
            Assert.AreEqual(expectedElement, scrapedElements[0]);
        }
예제 #22
0
        /// <summary>
        /// Checks that scraping without any parts returns a single element equal to
        /// the complete anchor tag from the input.
        /// </summary>
        public void FindCollectionWithNoParts()
        {
            var html = "Some dummy data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more dummy data";
            var expectedTag = "<a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a>";

            var criteriaBuilder = new ScrapeCriteriaBuilder()
                                  .WithData(html)
                                  .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                  .WithRegexOption(RegexOptions.ExplicitCapture);
            ScrapeCriteria criteria = criteriaBuilder.Build();

            var results = _scraper.Scrape(criteria);

            Assert.IsTrue(results.Count == 1);
            Assert.IsTrue(results[0] == expectedTag);
        }
예제 #23
0
        /// <summary>
        /// Console entry point: downloads a fixed Boston Craigslist search page, echoes
        /// the raw page and then prints every element the scraper returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                using (WebClient webClient = new WebClient())
                {
                    string pageHtml = webClient.DownloadString("https://boston.craigslist.org/d/cars-trucks/search/cta");

                    // The whole downloaded page is written out before scraping starts.
                    Console.WriteLine(pageHtml);

                    ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                        .withData(pageHtml)
                        .withRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                        .withRegexOption(RegexOptions.ExplicitCapture)
                        // Part matching the text that precedes </a>.
                        .withPart(new ScrapeCriteriaPartBuilder()
                            .withRegex(@">(.*?)</a>")
                            .withRegexOption(RegexOptions.Singleline)
                            .Build())
                        // Part matching the href attribute value.
                        .withPart(new ScrapeCriteriaPartBuilder()
                            .withRegex(@"href=\""(.*?)\""")
                            .withRegexOption(RegexOptions.Singleline)
                            .Build())
                        .Build();

                    var matches = new Scraper().Scrape(criteria);

                    if (!matches.Any())
                    {
                        Console.WriteLine("There are no match");
                    }
                    else
                    {
                        foreach (var match in matches)
                        {
                            Console.WriteLine(match);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Writes the full exception (type, message and stack trace).
                Console.WriteLine(ex);
            }
        }
예제 #24
0
        /// <summary>
        /// Scrapes the category drop-down out of <paramref name="webPage"/> and stores
        /// the scraped option values and names in <c>CategoriesFound</c>.
        /// </summary>
        /// <param name="webPage">HTML of the page that contains the category drop-down.</param>
        /// <returns>
        /// <c>CategoriesFound</c>. If any step throws, the error is written to the
        /// console and <c>CategoriesFound</c> is returned as it was before the call.
        /// </returns>
        public List<string> GetCategoryFrom(string webPage)
        {
            try
            {
                // Step 1: isolate the drop-down, which looks like
                //   <select id="subcatAbb" class="js-only"> ...options... </select>
                // The options sit on separate lines, so the pattern repeats a
                // "whitespace + rest of line" group up to 46 times before </select>.
                ScrapeCriteria dropDownCriteria = new ScrapeCriteriaBuilder()
                    .WithData(webPage)
                    .WithRegex(@"<select id=""subcatAbb"" class=""(.*?)"">((\s*)(.*)){0,46}(\s*)</select>")
                    .WithRegexOption(RegexOptions.ExplicitCapture)
                    .Build();

                Scraper categoryScraper = new Scraper();
                var dropDownMatches = categoryScraper.Scrape(dropDownCriteria);

                // Step 2: split every entry of the first match, each shaped like
                //   <option value="baa">baby+kids</option>
                // into its value and its display name. Indexing [0] throws when nothing
                // matched; that case is reported by the catch block below.
                ScrapeCriteria optionCriteria = new ScrapeCriteriaBuilder()
                    .WithData(dropDownMatches[0].ToString())
                    .WithRegex(@"<option value=""(.*)"">(.*)</option>")
                    .WithRegexOption(RegexOptions.ExplicitCapture)
                    .WithParts(new ScrapeCriteriaPartBuilder()
                        .WithRegex(@"value=""(.*)""")
                        .WithRegexOption(RegexOptions.Singleline)
                        .Build())
                    .WithParts(new ScrapeCriteriaPartBuilder()
                        .WithRegex(@">(.*)<")
                        .WithRegexOption(RegexOptions.Singleline)
                        .Build())
                    .Build();

                CategoriesFound = categoryScraper.Scrape(optionCriteria);
            }
            catch (Exception ex)
            {
                Console.WriteLine("There was an error while trying to get the category: {0}", ex.Message);
            }

            return CategoriesFound;
        }
예제 #25
0
        /// <summary>
        /// Console entry point: prompts for a city and a category, downloads the
        /// matching craigslist.org page and prints the page title the scraper returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Please enter which city you would like to scrape information from:");
                string craigslistCity = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter which category you would like to scrape:");
                string craigslistCategory = Console.ReadLine() ?? string.Empty;

                // Build the address from the answers above; an empty address makes
                // DownloadString throw before anything is fetched.
                // NOTE(review): assumes the usual <city>.craigslist.org/search/<category>
                // address shape - confirm against the site.
                string address = $"http://{craigslistCity.Replace(" ", string.Empty)}.craigslist.org/search/{craigslistCategory}";

                using (WebClient client = new WebClient())
                {
                    string         content        = client.DownloadString(address);
                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // Matches the <title> element, tolerating surrounding whitespace.
                                                    .WithRegex(@"<title>\s*(.+?)\s*</title>")
                                                    .WithRegexOptions(RegexOptions.ExplicitCapture)
                                                    .Build();

                    Scraper scraper         = new Scraper();
                    var     scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("There were no matches found for the specified scrape criteria");
                    }
                }
            }
            catch (Exception ex)
            {
                // Only the message is reported; the stack trace is dropped.
                Console.WriteLine(ex.Message);
            }
        }
예제 #26
0
        /// <summary>
        /// Console entry point: prompts for a city and a category, downloads the
        /// matching craigslist.org page and prints every element the scraper returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Enter which city you would like to scrape information for");
            var cityInput = Console.ReadLine() ?? string.Empty;

            Console.WriteLine("Please enter the craig list category you would like to scrape");
            var categoryInput = Console.ReadLine() ?? string.Empty;

            try
            {
                using var webClient = new WebClient();

                // The city is used as the sub-domain, so spaces are stripped from it.
                string pageUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
                string pageHtml = webClient.DownloadString(pageUrl);

                ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                    .WithData(pageHtml)
                    .WithRegex(MainRegex)
                    .WithRegexOption(RegexOptions.ExplicitCapture)
                    // Part matching the text that precedes </a>.
                    .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
                        .WithRegex(@">(.*?)</a>")
                        .WithRegexOption(RegexOptions.Singleline)
                        .Build())
                    // Part matching the href attribute value.
                    .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
                        .WithRegex(@"href=\""(.*?)\""")
                        .WithRegexOption(RegexOptions.Singleline)
                        .Build())
                    .Build();

                var matches = new Scrapper().Scrape(criteria);

                if (!matches.Any())
                {
                    Console.WriteLine("There were no matched lines for the specified criteria");
                }
                else
                {
                    foreach (var match in matches)
                    {
                        Console.WriteLine(match);
                    }
                }
            }
            catch (Exception e)
            {
                // Writes the full exception (type, message and stack trace).
                Console.WriteLine(e);
            }
        }
예제 #27
0
        /// <summary>
        /// Console entry point: prompts for a city and a category, downloads the
        /// matching craigslist.org page, prints every element the scraper returns and
        /// then waits for the user before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.Write("City to scrape information for: ");
                string cityInput = Console.ReadLine() ?? string.Empty;
                Console.Write("CraigsList category: ");
                string categoryInput = Console.ReadLine() ?? string.Empty;

                using (WebClient webClient = new WebClient())
                {
                    // One address serves both the progress message and the download.
                    // The city is used as the sub-domain, so spaces are stripped from it.
                    string pageUrl = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";

                    Console.WriteLine($"Scraping page {pageUrl}");
                    string pageHtml = webClient.DownloadString(pageUrl);

                    ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                        .WithData(pageHtml)
                        // Pattern for one whole listing anchor.
                        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                        .WithRegexOption(RegexOptions.ExplicitCapture)
                        // Part matching the text that precedes </a>.
                        .WithPart(new ScrapeCriteriaPartBuilder()
                            .WithRegex(@">(.*?)</a>")
                            .WithRegexOption(RegexOptions.Singleline)
                            .Build())
                        // Part matching the href attribute value.
                        .WithPart(new ScrapeCriteriaPartBuilder()
                            .WithRegex(@"href=\""(.*?)\""")
                            .WithRegexOption(RegexOptions.Singleline)
                            .Build())
                        .Build();

                    var matches = new Scraper().Scrape(criteria);

                    if (!matches.Any())
                    {
                        Console.WriteLine("There were no matches for the entered city and category.");
                    }
                    else
                    {
                        foreach (var match in matches)
                        {
                            Console.WriteLine(match);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Only the message is reported; the stack trace is dropped.
                Console.WriteLine(ex.Message);
            }
            finally
            {
                // Runs on success and on failure so the console window stays open.
                Console.Write("Press any key to exit.");
                Console.ReadLine();
            }
        }
예제 #28
0
        /// <summary>
        /// Console entry point: prompts for a Udemy course URL, downloads that page,
        /// saves it to <c>output.html</c> and prints every element the scraper returns
        /// for the <c>/watch?v=</c> pattern.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // Course page used when the user enters nothing.
            const string defaultCourseUrl = "https://www.udemy.com/course/learn-csharp-by-building-applications/";

            try
            {
                Console.WriteLine("Please enter which Udemy course URL that you would like to scrape:");
                var udemyCourseURL = Console.ReadLine() ?? string.Empty;

                if (string.IsNullOrWhiteSpace(udemyCourseURL))
                {
                    udemyCourseURL = defaultCourseUrl;
                }

                using (WebClient client = new WebClient())
                {
                    // Download once, from the address the user asked for. Declaring
                    // content/scrapeCriteria a second time in this scope does not compile.
                    string content = client.DownloadString(udemyCourseURL);

                    // Keep a copy of the raw page next to the executable for inspection.
                    File.WriteAllText("output.html", content);

                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content)
                                                    // Everything between "/watch?v=" and the next double quote.
                                                    .WithRegex(@"/watch\?v=(.*?)""")
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    .Build();

                    Scraper scraper = new Scraper();

                    var scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria.");
                    }
                }
            }
            catch (Exception ex)
            {
                // Only the message is reported; the stack trace is dropped.
                Console.WriteLine(ex.Message);
            }
        }
예제 #29
0
        /// <summary>
        /// Console entry point: prompts for a city and a category, downloads the
        /// matching craigslist.org page and prints every element the scraper returns.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            try
            {
                Console.WriteLine("Enter which city you would like scrape information from");
                var craigslistCity = Console.ReadLine() ?? string.Empty;

                Console.WriteLine("Please enter the Craiglist category you would like to scrape");
                var craiglistCategoryName = Console.ReadLine() ?? string.Empty;

                // Address of the page to download. The host must be "craigslist.org"
                // (with the "s"); the city is the sub-domain, so its spaces are stripped.
                string urlAddress = $"http://{craigslistCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craiglistCategoryName}";

                using (WebClient client = new WebClient())
                {
                    // Download the page
                    string content = client.DownloadString(urlAddress);

                    /*
                     *  Example of the markup being targeted:
                     *  <a href="https://boston.craigslist.org/nos/cto/d/2001-ford-350-xlt/6570954158.html" data-id="6570954158" class="result-title hdrlnk">2001 Ford F 350 XLT</a>
                     */

                    // Create the search criteria
                    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                                                    .WithData(content) // The content of the entire page that was downloaded
                                                    .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                                                    .WithRegexOption(RegexOptions.ExplicitCapture)
                                                    // Part matching the text that precedes </a>.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@">(.*?)</a>")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    // Part matching the href attribute value.
                                                    .WithPart(new ScrapeCriteriaPartBuilder()
                                                              .WithRegex(@"href=\""(.*?)\""")
                                                              .WithRegexOption(RegexOptions.Singleline)
                                                              .Build())
                                                    .Build();

                    // Start scraping
                    Scraper scraper         = new Scraper();
                    var     scrapedElements = scraper.Scrape(scrapeCriteria);

                    if (scrapedElements.Any())
                    {
                        foreach (var scrapedElement in scrapedElements)
                        {
                            Console.WriteLine(scrapedElement);
                        }
                    }
                    else
                    {
                        Console.WriteLine("There were no matches for the specified scrape criteria ");
                    }
                }
            }
            catch (Exception ex)
            {
                // Only the message is reported; the stack trace is dropped.
                Console.WriteLine(ex.Message);
            }
        }