/// <summary>
/// Scrapes a single listing anchor with two part criteria and expects the
/// link text followed by the href value.
/// </summary>
public void FindCollectionWithTwoParts()
{
    // Arrange: one listing anchor surrounded by unrelated text.
    var content = "Some dummy data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more dummy data";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        // Pattern for the whole <a> element.
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        // First part: the text between '>' and '</a>' (a builder nested in a builder).
        .WithPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@">(.*?)</a>")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        // Second part: the value of the href attribute.
        .WithPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@"href=\""(.*?)\""")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        .Build();

    // Act
    var foundElements = _scraper.Scrape(scrapeCriteria);

    // Assert: AreEqual reports the expected and actual values when a check fails.
    Assert.AreEqual(2, foundElements.Count);
    Assert.AreEqual("some text", foundElements[0]);         // description comes first
    Assert.AreEqual("http://domain.com", foundElements[1]); // then the link
}
/// <summary>
/// Verifies that two part criteria applied to one matching anchor produce
/// two results: the anchor text and then the href value.
/// </summary>
public void FindCollectionWithTwoParts()
{
    // Arrange
    var content = "Some fluff data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more fluff data";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        // Part 1: anchor text.
        .WithPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@">(.*?)</a>")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        // Part 2: href attribute value.
        .WithPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@"href=\""(.*?)\""")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        .Build();

    // Act
    var foundElements = _scraper.Scrape(scrapeCriteria);

    // Assert: value-based assertions so a failure shows what was actually returned.
    Assert.AreEqual(2, foundElements.Count);
    Assert.AreEqual("some text", foundElements[0]);
    Assert.AreEqual("http://domain.com", foundElements[1]);
}
/// <summary>
/// Scrapes one listing element with a description part and a URL part and
/// expects exactly those two values, in that order.
/// </summary>
public void FindCollectionWithTwoParts()
{
    // Arrange
    string content = "Lorem ipsum <a href=\"http://domain.com\" data-id=\"dataID\" class=\"result-title hdrlnk\">Item Description</a> dolor sit amet ";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        // Regex for the entire listing element.
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        // Part criteria for the listing description.
        .WithPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@">(.*?)</a>")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        // Part criteria for the listing URL.
        .WithPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@"href=\""(.*?)\""")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        .Build();

    // Act
    var foundElements = _scraper.Scrape(scrapeCriteria);

    // Assert: AreEqual(expected, actual) gives a diagnostic message on failure.
    Assert.AreEqual(2, foundElements.Count);
    Assert.AreEqual("Item Description", foundElements[0]);
    Assert.AreEqual("http://domain.com", foundElements[1]);
}
/// <summary>
/// Scrapes a realistic listing anchor with two part criteria and expects
/// the listing title followed by the listing URL.
/// </summary>
public void FindCollectionWithTwoParts()
{
    // Arrange
    var content = @"some stuff data <a href=""https://boston.craigslist.org/bmw/ctd/d/waltham-2010-lexus-is-250c-base-2dr/7073589362.html"" data-id=""7073589362"" class=""result-title hdrlnk"">2010 Lexus IS 250C Base 2dr Convertible 6A - EASY FINANCING!</a> and more fluff data";

    var scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        // Part 1: anchor text (the listing title).
        .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@">(.*?)</a>")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        // Part 2: href attribute value (the listing URL).
        .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
            .WithRegex(@"href=\""(.*?)\""")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        .Build();

    // Act
    var foundElements = _scrapper.Scrape(scrapeCriteria);

    // Assert: value-based assertions report what was actually scraped on failure.
    Assert.AreEqual(2, foundElements.Count);
    Assert.AreEqual(@"2010 Lexus IS 250C Base 2dr Convertible 6A - EASY FINANCING!", foundElements[0]);
    Assert.AreEqual(@"https://boston.craigslist.org/bmw/ctd/d/waltham-2010-lexus-is-250c-base-2dr/7073589362.html", foundElements[1]);
}
/// <summary>
/// Console entry point: asks for a city and a category, downloads the
/// matching Craigslist page and prints every scraped element.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter which city you would like to scrape information from: ");
        var craigslistCity = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please enter the CraigsList category that you would like to scrape: ");
        var craigslistCategoryName = Console.ReadLine() ?? string.Empty;

        using (WebClient webClient = new WebClient())
        {
            // Spaces are stripped from the city so it can be used as a host name part.
            // Method is a member defined outside this block.
            string content = webClient.DownloadString($"https://{craigslistCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craigslistCategoryName}");

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // Pattern for the whole listing anchor.
                .WithRegex(@"<a href=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                // Part 1: anchor text.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // Part 2: href attribute value.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            Scraper scraper = new Scraper();
            var scrapedElements = scraper.Scrape(scrapeCriteria);

            if (scrapedElements.Any())
            {
                foreach (var scrapedElement in scrapedElements)
                {
                    Console.WriteLine(scrapedElement);
                }
            }
            else
            {
                Console.WriteLine("There are no matches for the specified criteria");
            }
        }
    }
    catch (Exception ex)
    {
        // Any failure (bad host, network error, ...) is reported as its message only.
        Console.WriteLine(ex.Message);
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}
/// <summary>
/// Feeds the scraper a single listing anchor plus two part criteria and
/// checks that the title and the URL come back, in that order.
/// </summary>
public void FindCollectionWithTwoParts()
{
    // Pattern that matches the complete listing anchor.
    const string listingPattern = @"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>";

    // Stand-in for the downloaded page: it consists of just one listing anchor.
    var html = "<a href=\"https://boston.craigslist.org/nos/cto/d/2001-ford-350-xlt/6570954158.html\" data-id=\"6570954158\" class=\"result-title hdrlnk\">2001 Ford F 350 XLT</a>";

    // Part that picks out the anchor text.
    var titlePart = new ScrapeCriteriaPartBuilder()
        .WithRegex(@">(.*?)</a>")
        .WithRegexOption(RegexOptions.Singleline)
        .Build();

    // Part that picks out the href attribute value.
    var linkPart = new ScrapeCriteriaPartBuilder()
        .WithRegex(@"href=\""(.*?)\""")
        .WithRegexOption(RegexOptions.Singleline)
        .Build();

    ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
        .WithData(html)
        .WithRegex(listingPattern)
        .WithRegexOption(RegexOptions.ExplicitCapture)
        .WithPart(titlePart)
        .WithPart(linkPart)
        .Build();

    var results = _scraper.Scrape(criteria);

    // Two entries are expected: one per part.
    Assert.True(results.Count == 2);
    Assert.True(results[0] == "2001 Ford F 350 XLT");
    Assert.True(results[1] == "https://boston.craigslist.org/nos/cto/d/2001-ford-350-xlt/6570954158.html");
}
/// <summary>
/// Scrapes one anchor with an href part and a tag-content part and expects
/// the link followed by the text inside the tag (surrounding spaces included).
/// </summary>
public void ScraperTakesTwoParts_ReturnsLinkAndContentWithinTag()
{
    // Arrange
    string content = "Some data present here <a href=\"anysite.com\" class=\"anyClass\"> Content within tag </a>";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=""(.*?)"" class=""(.*?)""> (.*?) </a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        // Part 1: href attribute value.
        .WithParts(new ScrapeCriteriaPartBuilder()
            .WithRegex(@"href=""(.*?)""")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        // Part 2: everything between '>' and the next '<'.
        .WithParts(new ScrapeCriteriaPartBuilder()
            .WithRegex(@">(.*?)<")
            .WithRegexOption(RegexOptions.Singleline)
            .Build())
        .Build();

    List<string> expectedElements = new List<string> { "anysite.com", " Content within tag " };

    // Act
    var scrapedElements = _scraper.Scrape(scrapeCriteria);

    // Assert: expected value goes first so failure messages label the values correctly.
    Assert.AreEqual(2, scrapedElements.Count);
    Assert.AreEqual(expectedElements[0], scrapedElements[0]);
    Assert.AreEqual(expectedElements[1], scrapedElements[1]);
}
/// <summary>
/// Console entry point: clears the screen, asks for a place and a category,
/// downloads the matching Craigslist page and prints each scraped element
/// as a bullet.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    Console.Clear();

    try
    {
        Console.Write("Inserissci il nome del luogo: ");
        var craigListCity = Console.ReadLine() ?? "";

        Console.Write("Inserissci la categoria craigList: ");
        // ReadLine returns null at end of input; treat that like an empty answer,
        // the same way the city is handled.
        string craigListCategory = Console.ReadLine() ?? "";

        // The using block disposes the client once we are done with it.
        using (WebClient client = new WebClient())
        {
            // Build the address once and use it for both the trace line and the download.
            // 'method' is a member defined outside this block.
            string address = $"http://{craigListCity.Replace(" ", string.Empty)}.craigList.org/{method}/{craigListCategory}";
            Console.WriteLine("<<<<<<<<<<<<<<<<<<<<<<" + address);

            string content = client.DownloadString(address);

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // Pattern for the whole listing anchor.
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                // Part 1: anchor text.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // Part 2: href attribute value.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            Scraper scraper = new Scraper();
            var scrapedElements = scraper.Scrape(scrapeCriteria);

            if (scrapedElements.Count != 0)
            {
                foreach (var scrapedElement in scrapedElements)
                {
                    Console.WriteLine("• " + scrapedElement);
                }
            }
            else
            {
                Console.WriteLine("Non ho trovato niente!");
            }
        }
    }
    catch (Exception ex)
    {
        // Report only the message of whatever went wrong.
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Console entry point: reads a city and a category, fetches the matching
/// Craigslist page and writes each scraped element on its own line.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter which city you would like to scrape from: ");
        // A null answer (end of input) is replaced by an empty string.
        string cityInput = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please enter the CraigsList category that you would like to scrape: ");
        string categoryInput = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // Spaces are removed from the city ("New York" becomes "NewYork").
            // Method is a member defined outside this block.
            var pageAddress = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
            var pageSource = webClient.DownloadString(pageAddress);

            // Part for the anchor text.
            var textPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*?)</a>")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // Part for the href attribute value.
            var hrefPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@"href=\""(.*?)""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // The outer pattern matches the complete listing anchor.
            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                .WithPart(textPart)
                .WithPart(hrefPart)
                .Build();

            var results = new Scraper().Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There were no matches for the specified scrape criteria.");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
    }
}
/// <summary>
/// Console entry point: prompts for a city and a category, downloads the
/// corresponding Craigslist page and prints what the scraper extracts.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter which city you would like to scrape information from: ");
        string city = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please enter the CraigsList category that you would like to scrape: ");
        string category = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // Replace strips the spaces from the input, e.g. "new york" becomes "newyork".
            // Method is a member defined outside this block.
            var address = $"http://{city.Replace(" ", string.Empty)}.craigslist.org/{Method}/{category}";

            // DownloadString returns the resource at the address as one string.
            var pageSource = webClient.DownloadString(address);

            // Part criteria for the listing title (text between '>' and '</a>').
            var titlePart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*?)</a>")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // Part criteria for the listing link (href attribute value).
            var linkPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@"href=\""(.*?)\""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // The outer regex is the full match for one listing anchor.
            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOptions(RegexOptions.ExplicitCapture)
                .WithPart(titlePart)
                .WithPart(linkPart)
                .Build();

            // Run the criteria against the downloaded source.
            var scraper = new Scraper();
            var results = scraper.Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There were no matches for the specified scrape criteria");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
    }
}
/// <summary>
/// Console entry point: scrapes listing titles and links for a city and a
/// category entered by the user, then waits for a key press before exiting
/// (on success and on failure alike).
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter which city you would like to scrape information from:");
        string city = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please enter the CraigsList category that you would like to scrape:");
        string category = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // Method is a member defined outside this block.
            var address = $"http://{city.Replace(" ", string.Empty)}.craigslist.org/{Method}/{category}";
            var pageSource = webClient.DownloadString(address);

            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                // Whole listing anchor.
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                // Anchor text.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // href attribute value.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            var results = new Scraper().Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There were no matches for the specified scrape criteria.");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }

            Console.WriteLine("Done. Press any key to exit...");
            Console.ReadKey();
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
        Console.WriteLine("Press any key to exit...");
        Console.ReadKey();
    }
}
/// <summary>
/// Console entry point: asks for a city and a category, downloads the
/// Craigslist search page for them as UTF-8 and prints each scraped element.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Enter the City to scrape information from");
        // ReadLine returns null at end of input; fall back to an empty string so
        // the Replace call below cannot throw a NullReferenceException.
        var city = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Enter the category to scrape information");
        var category = Console.ReadLine() ?? string.Empty;

        using (WebClient client = new WebClient())
        {
            client.Encoding = Encoding.UTF8;

            // Spaces are stripped from the city so it can be used as a host name part.
            var url = $"https://{city.Replace(" ", string.Empty)}.craigslist.org/search/{category}";
            string content = client.DownloadString(url);

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // Pattern for the whole listing anchor.
                .WithRegex("<a href=\"(.*?)\" data-id=\"(.*?)\" class=\"result-title hdrlnk\">(.*?)</a>")
                .WithRegexOptions(RegexOptions.ExplicitCapture)
                // Part 1: anchor text.
                .WithParts(new ScrapeCriteriaPartBuilder()
                    .WithRegex(">(.*?)</a>")
                    .WithRegexOptions(RegexOptions.Singleline)
                    .Build())
                // Part 2: href attribute value.
                .WithParts(new ScrapeCriteriaPartBuilder()
                    .WithRegex("href=\"(.*?)\"")
                    .WithRegexOptions(RegexOptions.Singleline)
                    .Build())
                .Build();

            Scraper scraper = new Scraper();
            var elements = scraper.Scrape(scrapeCriteria);

            if (elements.Any())
            {
                foreach (var element in elements)
                {
                    Console.WriteLine(element);
                }
            }
            else
            {
                Console.WriteLine("No matches found");
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Console entry point: reads a city and a category, downloads the matching
/// Craigslist page and prints every element the scraper returns.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.Write("Please enter a city: ");
        string cityInput = Console.ReadLine() ?? string.Empty;

        Console.Write("Please enter a category: ");
        string categoryInput = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // Method is a member defined outside this block.
            string address = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
            string pageSource = webClient.DownloadString(address);

            // Part for the anchor text.
            var textPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*?)</a>")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // Part for the href attribute value.
            var hrefPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@"href=\""(.*?)\""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            var criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                .WithPart(textPart)
                .WithPart(hrefPart)
                .Build();

            var results = new Scraper().Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There were no matches.");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
    }
}
/// <summary>
/// Console entry point: asks for a city and a category, fetches the matching
/// Craigslist page and prints the description and link parts that the
/// scraper extracts.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter which city you would like to scrape information from:");
        var city = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please enter the CraigsList category from availables:");
        var category = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // Method is a member defined outside this block.
            var address = $"http://{city.Replace(" ", string.Empty)}.craigslist.org/{Method}/{category}";
            var pageSource = webClient.DownloadString(address);

            // This part gets the description.
            var descriptionPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(">(.*?)</a>")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // This part gets the link.
            var linkPart = new ScrapeCriteriaPartBuilder()
                .WithRegex("href=\"(.*?)\"")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex("<a href=\"(.*?)\" data-id=\"(.*?)\" class=\"result-title hdrlnk\">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                .WithPart(descriptionPart)
                .WithPart(linkPart)
                .Build();

            var results = new Scraper().Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There were no matches for the specified scrape criteria");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
    }
}
/// <summary>
/// Scrapes one listing anchor without any part criteria and expects exactly
/// one result.
/// </summary>
public void FindCollectionWithNoParts()
{
    // Arrange
    string content = "Lorem ipsum <a href=\"http://domain.com\" data-id=\"dataID\" class=\"result-title hdrlnk\">Item Description</a> dolor sit amet";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        .Build();

    // Act
    var foundElements = _scraper.Scrape(scrapeCriteria);

    // Assert: AreEqual shows the actual count if the expectation is not met.
    Assert.AreEqual(1, foundElements.Count);
}
/// <summary>
/// Console entry point: prompts for a US city and a Craigslist category code,
/// downloads the matching page and prints each scraped element prefixed with
/// "Found match".
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please provide city in USA you would like to scrape info from Craigslist, for ex.: Boston");
        var cityInput = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please provide Craigslist category you would like to scrape, for ex.: cta (stands for Cars Trucks Automotive category, to obtain that simply navigate to desired category in web browser and check URL for name of category to provide)");
        var categoryInput = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // Method is a member defined outside this block.
            var address = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
            var pageSource = webClient.DownloadString(address);

            // Part for the anchor text.
            var textPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*?)</a>")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // Part for the href attribute value.
            var hrefPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@"href=\""(.*?)\""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                .WithPart(textPart)
                .WithPart(hrefPart)
                .Build();

            List<string> matches = new Scraper().Scrape(criteria);

            if (!matches.Any())
            {
                Console.WriteLine("There was no match for specified scraping criteria");
            }
            else
            {
                foreach (string match in matches)
                {
                    Console.WriteLine($"Found match {match}");
                }
            }
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
    }
}
/// <summary>
/// Scrapes one anchor without part criteria and expects the complete anchor
/// element as the only result.
/// </summary>
public void FoundElementsWithNoParts()
{
    // Arrange
    var content = "Some fluff data <a href=\"http://domain.com\" data-id=\"someId\">some text</a> more fluff data";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"">(.*?)</a>")
        .WithRegexOptions(RegexOptions.ExplicitCapture)
        .Build();

    // Act
    var elements = _scraper.Scrape(scrapeCriteria);

    // Assert: value-based assertions report the actual result on failure.
    Assert.AreEqual(1, elements.Count);
    Assert.AreEqual("<a href=\"http://domain.com\" data-id=\"someId\">some text</a>", elements[0]);
}
/// <summary>
/// Scrapes a realistic listing anchor without part criteria and expects the
/// whole anchor element back as the single result.
/// </summary>
public void FindCollectionWithNoParts()
{
    // Arrange
    string testHtml = "Some dummy text <a href=\"https://boston.craigslist.org/gbs/ctd/d/2014-gmc-sierra-1500-regular/6765227753.html\" data-id=\"6765227753\" class=\"result-title hdrlnk\">2014 GMC Sierra 1500 Regular Cab Pickup 2D 6 1/2 ft pickup RED -</a> more dummy text";

    ScrapeCriteria scrapeCriteriaWithNoPart = new ScrapeCriteriaBuilder()
        .WithData(testHtml)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        .Build();

    // Act
    List<string> scrapeResults = scraper.Scrape(scrapeCriteriaWithNoPart);

    // Assert: AreEqual(expected, actual) reports both values when a check fails.
    Assert.AreEqual(1, scrapeResults.Count);
    Assert.AreEqual("<a href=\"https://boston.craigslist.org/gbs/ctd/d/2014-gmc-sierra-1500-regular/6765227753.html\" data-id=\"6765227753\" class=\"result-title hdrlnk\">2014 GMC Sierra 1500 Regular Cab Pickup 2D 6 1/2 ft pickup RED -</a>", scrapeResults[0]);
}
/// <summary>
/// Scrapes one listing anchor without part criteria and expects the complete
/// anchor element as the only result.
/// </summary>
public void FindCollectionWithNoParts()
{
    // Arrange
    var content = "Some filler data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more filler data";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(System.Text.RegularExpressions.RegexOptions.ExplicitCapture)
        .Build();

    // Act
    var foundElements = _scraper.Scrape(scrapeCriteria);

    // Assert: value-based assertions report the actual result on failure.
    Assert.AreEqual(1, foundElements.Count);
    Assert.AreEqual("<a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a>", foundElements[0]);
}
/// <summary>
/// Console entry point: asks for a city and a category, downloads the
/// matching Craigslist page and prints the description and link of every
/// listing the scraper finds.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Write("Please enter the city you like to be scraped:\t");
        var craiglistCity = ReadLine() ?? string.Empty;

        Write("Please enter the CraigList catagory:\t");
        var craiglistCatagory = ReadLine() ?? string.Empty;

        using (WebClient client = new WebClient())
        {
            // Method is a member defined outside this block.
            string content = client.DownloadString($"http://{craiglistCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craiglistCatagory}");

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // The class attribute holds two space-separated names ("result-title hdrlnk");
                // the previous hyphenated spelling could not match that markup.
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                // Part 1: anchor text (the listing description).
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // Part 2: href attribute value (previously a duplicate of part 1).
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            Scraper scraper = new Scraper();
            List<string> scrapedItems = scraper.Scrape(scrapeCriteria);

            if (scrapedItems.Any())
            {
                foreach (var item in scrapedItems)
                {
                    // Print the item itself; an argument-less WriteLine only emits a blank line.
                    WriteLine(item);
                }
            }
            else
            {
                WriteLine("There are no matches found .....");
            }
        }
    }
    catch (Exception ex)
    {
        // Writes the exception's full string form, not just its message.
        WriteLine(ex);
    }
}
/// <summary>
/// Scrapes one anchor without part criteria and expects the whole anchor tag
/// as the single result.
/// </summary>
public void ScraperTakesNoParts_ReturnsWholeAnchorTag()
{
    // Arrange
    string content = "Some data present here <a href=\"anysite.com\" class=\"anyClass\"> Content within tag </a>";
    var expectedElement = "<a href=\"anysite.com\" class=\"anyClass\"> Content within tag </a>";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=""(.*?)"" class=""(.*?)""> (.*?) </a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        .Build();

    // Act
    var scrapedElement = _scraper.Scrape(scrapeCriteria);

    // Assert: expected value goes first so failure messages label the values correctly.
    Assert.AreEqual(1, scrapedElement.Count);
    Assert.AreEqual(expectedElement, scrapedElement[0]);
}
/// <summary>
/// Scrapes one listing anchor without part criteria and expects the complete
/// anchor element as the only result.
/// </summary>
public void FindCollectionWithNoParts()
{
    // Arrange
    var content = "Some dummy data <a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a> more dummy data";

    ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
        .WithData(content)
        .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
        .WithRegexOption(RegexOptions.ExplicitCapture)
        .Build();

    // Act
    var foundElements = _scraper.Scrape(scrapeCriteria);

    // Assert: value-based assertions report the actual result on failure.
    Assert.AreEqual(1, foundElements.Count);
    Assert.AreEqual("<a href=\"http://domain.com\" data-id=\"someId\" class=\"result-title hdrlnk\">some text</a>", foundElements[0]);
}
/// <summary>
/// Console entry point: downloads a fixed Boston cars-and-trucks page, echoes
/// the raw page to the console, then prints every element the scraper extracts.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        using (var webClient = new WebClient())
        {
            var pageSource = webClient.DownloadString("https://boston.craigslist.org/d/cars-trucks/search/cta");

            // The whole downloaded page is written out before scraping.
            Console.WriteLine(pageSource);

            // Part for the anchor text.
            var textPart = new ScrapeCriteriaPartBuilder()
                .withRegex(@">(.*?)</a>")
                .withRegexOption(RegexOptions.Singleline)
                .Build();

            // Part for the href attribute value.
            var hrefPart = new ScrapeCriteriaPartBuilder()
                .withRegex(@"href=\""(.*?)\""")
                .withRegexOption(RegexOptions.Singleline)
                .Build();

            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .withData(pageSource)
                .withRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .withRegexOption(RegexOptions.ExplicitCapture)
                .withPart(textPart)
                .withPart(hrefPart)
                .Build();

            var results = new Scraper().Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There are no match");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        // Writes the exception's full string form, not just its message.
        Console.WriteLine(error);
    }
}
/// <summary>
/// Extracts category options from a page in two passes: first the
/// <c>&lt;select id="subcatAbb"&gt;</c> element is isolated, then each
/// <c>&lt;option&gt;</c> inside it is split into its value and its label.
/// The result is stored in <c>CategoriesFound</c> and returned.
/// </summary>
/// <param name="webPage">HTML source of the page to search.</param>
/// <returns>
/// <c>CategoriesFound</c>. When the select element is missing or an exception
/// occurs, a message is written to the console and <c>CategoriesFound</c> is
/// returned as it was before the call.
/// </returns>
public List<string> GetCategoryFrom(string webPage)
{
    try
    {
        // Expected markup:
        //   <select id="subcatAbb" class="js-only"> ...options... </select>
        //   <option value="baa">baby+kids</option>
        // The {0,46} quantifier lets the select span several lines.
        // NOTE(review): 46 looks like the number of option lines at the time of writing - confirm.
        ScrapeCriteria categoryCriteria = new ScrapeCriteriaBuilder()
            .WithData(webPage)
            .WithRegex(@"<select id=""subcatAbb"" class=""(.*?)"">((\s*)(.*)){0,46}(\s*)</select>")
            .WithRegexOption(RegexOptions.ExplicitCapture)
            .Build();

        Scraper scrapeCategory = new Scraper();
        var categoriesHTML = scrapeCategory.Scrape(categoryCriteria);

        // Check for an empty result explicitly instead of letting the indexer
        // below throw and relying on the catch block.
        if (categoriesHTML.Count == 0)
        {
            Console.WriteLine("There was an error while trying to get the category: {0}", "the category select element was not found in the page");
            return CategoriesFound;
        }

        ScrapeCriteria categoryValueAndName = new ScrapeCriteriaBuilder()
            .WithData(categoriesHTML[0])
            .WithRegex(@"<option value=""(.*)"">(.*)</option>")
            .WithRegexOption(RegexOptions.ExplicitCapture)
            // Part 1: the option's value attribute.
            .WithParts(new ScrapeCriteriaPartBuilder()
                .WithRegex(@"value=""(.*)""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build())
            // Part 2: the option's display text.
            .WithParts(new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*)<")
                .WithRegexOption(RegexOptions.Singleline)
                .Build())
            .Build();

        CategoriesFound = scrapeCategory.Scrape(categoryValueAndName);
    }
    catch (Exception ex)
    {
        Console.WriteLine("There was an error while trying to get the category: {0}", ex.Message);
    }

    return CategoriesFound;
}
/// <summary>
/// Console entry point: asks for a city and a category, downloads the
/// corresponding Craigslist page and prints every match of the page-title
/// pattern.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Please enter which city you would like to scrape information from:");
        string craigslistCity = Console.ReadLine() ?? string.Empty;

        Console.WriteLine("Please enter which category you would like to scrape:");
        string craigslistCategory = Console.ReadLine() ?? string.Empty;

        using (WebClient client = new WebClient())
        {
            // Build the address from the two answers. The previous code passed an
            // empty address, so the download failed on every run and both inputs
            // were ignored.
            // NOTE(review): the "search" path segment is an assumption - confirm the intended URL layout.
            string address = $"https://{craigslistCity.Replace(" ", string.Empty)}.craigslist.org/search/{craigslistCategory}";
            string content = client.DownloadString(address);

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // Matches a <title> element, allowing whitespace around its text.
                .WithRegex(@"<title>\s*(.+?)\s*</title>")
                .WithRegexOptions(RegexOptions.ExplicitCapture)
                .Build();

            Scraper scraper = new Scraper();
            var scrapedElements = scraper.Scrape(scrapeCriteria);

            if (scrapedElements.Any())
            {
                foreach (var scrapedElement in scrapedElements)
                {
                    Console.WriteLine(scrapedElement);
                }
            }
            else
            {
                Console.WriteLine("There were no matches found for the specified scrape criteria");
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Console entry point: reads a city and a category, downloads the matching
/// Craigslist page and prints each element the scrapper extracts.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The prompts run outside the try block, so only the download and the
    // scraping are covered by the catch below.
    Console.WriteLine("Enter which city you would like to scrape information for");
    string cityInput = Console.ReadLine() ?? string.Empty;

    Console.WriteLine("Please enter the craig list category you would like to scrape");
    string categoryInput = Console.ReadLine() ?? string.Empty;

    try
    {
        using (var webClient = new WebClient())
        {
            // Method and MainRegex are members defined outside this block.
            string address = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
            string pageSource = webClient.DownloadString(address);

            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex(MainRegex)
                .WithRegexOption(RegexOptions.ExplicitCapture)
                // Part 1: anchor text.
                .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // Part 2: href attribute value.
                .WithScrapeCriteriaPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            var results = new Scrapper().Scrape(criteria);

            if (!results.Any())
            {
                Console.WriteLine("There were no matched lines for the specified criteria");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        // Writes the exception's full string form, not just its message.
        Console.WriteLine(error);
    }
}
/// <summary>
/// Console entry point: asks for a city and a category, announces and
/// downloads the matching Craigslist page, prints the scraped listing parts
/// and finally waits for the user to press Enter.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        // Collect the two inputs; a null answer becomes an empty string.
        Console.Write("City to scrape information for: ");
        var cityInput = Console.ReadLine() ?? string.Empty;

        Console.Write("CraigsList category: ");
        var categoryInput = Console.ReadLine() ?? string.Empty;

        using (var webClient = new WebClient())
        {
            // The same address is shown to the user and then downloaded.
            // Method is a member defined outside this block.
            var address = $"http://{cityInput.Replace(" ", string.Empty)}.craigslist.org/{Method}/{categoryInput}";
            Console.WriteLine($"Scraping page {address}");
            var pageSource = webClient.DownloadString(address);

            // Part criteria for the listing description.
            var descriptionPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@">(.*?)</a>")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // Part criteria for the listing URL.
            var urlPart = new ScrapeCriteriaPartBuilder()
                .WithRegex(@"href=\""(.*?)\""")
                .WithRegexOption(RegexOptions.Singleline)
                .Build();

            // The outer regex matches the entire listing element.
            ScrapeCriteria criteria = new ScrapeCriteriaBuilder()
                .WithData(pageSource)
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                .WithPart(descriptionPart)
                .WithPart(urlPart)
                .Build();

            var results = new Scraper().Scrape(criteria);

            // Show the scraped parts, or a notice when there are none.
            if (!results.Any())
            {
                Console.WriteLine("There were no matches for the entered city and category.");
            }
            else
            {
                foreach (var result in results)
                {
                    Console.WriteLine(result);
                }
            }
        }
    }
    catch (Exception error)
    {
        Console.WriteLine(error.Message);
    }
    finally
    {
        // Runs on success and on failure; ReadLine means Enter is required despite the wording.
        Console.Write("Press any key to exit.");
        Console.ReadLine();
    }
}
/// <summary>
/// Console entry point: asks for a Udemy course URL, downloads that page,
/// saves a copy to <c>output.html</c> and prints every match of the
/// <c>/watch?v=...</c> pattern found in it.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Course page used when the user submits an empty line.
    const string defaultCourseUrl = "https://www.udemy.com/course/learn-csharp-by-building-applications/";

    try
    {
        Console.WriteLine("Please enter which Udemy course URL that you would like to scrape:");
        var udemyCourseURL = Console.ReadLine() ?? string.Empty;

        // The previous code declared 'content' and 'scrapeCriteria' twice in the same
        // scope (a compile error) and downloaded the hard-coded page even after asking
        // for a URL. There is now one download: the entered URL, or the default when
        // nothing was entered.
        var address = udemyCourseURL.Length > 0 ? udemyCourseURL : defaultCourseUrl;

        using (WebClient client = new WebClient())
        {
            string content = client.DownloadString(address);

            // Keep a copy of the downloaded page next to the executable for inspection.
            using (FileStream fileStream = new FileStream("output.html", FileMode.Create))
            using (StreamWriter streamWriter = new StreamWriter(fileStream))
            {
                streamWriter.Write(content);
            }

            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content)
                // Matches "/watch?v=" followed by anything up to the next double quote.
                .WithRegex(@"/watch\?v=(.*?)""")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                .Build();

            Scraper scraper = new Scraper();
            var scrapedElements = scraper.Scrape(scrapeCriteria);

            if (scrapedElements.Any())
            {
                foreach (var scrapedElement in scrapedElements)
                {
                    Console.WriteLine(scrapedElement);
                }
            }
            else
            {
                Console.WriteLine("There were no matches for the specified scrape criteria.");
            }
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
/// <summary>
/// Console entry point: asks for a city and a category, downloads the
/// matching Craigslist page and prints every scraped element.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    try
    {
        Console.WriteLine("Enter which city you would like scrape information from");
        var craigslistCity = Console.ReadLine() ?? string.Empty;

        System.Console.WriteLine("Please enter the Craiglist category you would like to scrape");
        var craiglistCategoryName = Console.ReadLine() ?? string.Empty;

        // Address of the page to download. The host is "craigslist.org"; the earlier
        // spelling without the "s" pointed at a different domain.
        // Method is a member defined outside this block.
        string urlAddress = $"http://{craigslistCity.Replace(" ", string.Empty)}.craigslist.org/{Method}/{craiglistCategoryName}";

        using (WebClient client = new WebClient())
        {
            // Download the page.
            string content = client.DownloadString(urlAddress);

            // Example of the markup being matched:
            // <a href="https://boston.craigslist.org/nos/cto/d/2001-ford-350-xlt/6570954158.html" data-id="6570954158" class="result-title hdrlnk">2001 Ford F 350 XLT</a>

            // Create the search criteria.
            ScrapeCriteria scrapeCriteria = new ScrapeCriteriaBuilder()
                .WithData(content) // the content of the entire page that was downloaded
                .WithRegex(@"<a href=\""(.*?)\"" data-id=\""(.*?)\"" class=\""result-title hdrlnk\"">(.*?)</a>")
                .WithRegexOption(RegexOptions.ExplicitCapture)
                // Part 1: anchor text.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@">(.*?)</a>")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                // Part 2: href attribute value.
                .WithPart(new ScrapeCriteriaPartBuilder()
                    .WithRegex(@"href=\""(.*?)\""")
                    .WithRegexOption(RegexOptions.Singleline)
                    .Build())
                .Build();

            // Start scraping.
            Scraper scraper = new Scraper();
            var scrapedElements = scraper.Scrape(scrapeCriteria);

            if (scrapedElements.Any())
            {
                foreach (var scrapedElement in scrapedElements)
                {
                    System.Console.WriteLine(scrapedElement);
                }
            }
            else
            {
                System.Console.WriteLine("There were no matches for the specified scrape criteria ");
            }
        }
    }
    catch (Exception ex)
    {
        System.Console.WriteLine(ex.Message);
    }
}