/// <summary>
/// Runs the criteria's regular expression over its data and collects the matched text.
/// </summary>
/// <param name="scrapeCriteria">Supplies the input data, the pattern, its options and the optional parts.</param>
/// <returns>
/// The full text of every match when the criteria has no parts; otherwise, for each match,
/// the full text matched by every part pattern that succeeds against that match.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    var results = new List<string>();

    foreach (Match elementMatch in Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption))
    {
        // Match.Value is the same text as Groups[0].Value: the whole match.
        string elementText = elementMatch.Value;

        // Nothing to drill into: the whole match is the result.
        if (!scrapeCriteria.Parts.Any())
        {
            results.Add(elementText);
            continue;
        }

        // Apply each part pattern to the parent match and keep the ones that hit.
        foreach (var part in scrapeCriteria.Parts)
        {
            Match partMatch = Regex.Match(elementText, part.Regex, part.RegexOption);
            if (!partMatch.Success)
            {
                continue;
            }

            results.Add(partMatch.Value);
        }
    }

    return results;
}
/// <summary>
/// Matches the criteria's pattern against its data and returns the extracted strings.
/// </summary>
/// <param name="scrapeCriteria">Holds the data, the pattern, the regex options and the list of parts.</param>
/// <returns>
/// Whole-match text for each match when there are no parts; otherwise the first capture
/// group of every part pattern that succeeds against a match.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    List<string> collected = new List<string>();
    MatchCollection found = Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption);

    foreach (Match outer in found)
    {
        string outerText = outer.Value;

        // No parts to distill: the outer match itself is the final result.
        if (scrapeCriteria.Parts.Count == 0)
        {
            collected.Add(outerText);
            continue;
        }

        // Distill each part out of the outer match.
        foreach (ScrapeCriteriaPart part in scrapeCriteria.Parts)
        {
            Match inner = Regex.Match(outerText, part.Regex, part.RegexOption);
            if (!inner.Success)
            {
                continue;
            }

            collected.Add(inner.Groups[1].Value);
        }
    }

    return collected;
}
/// <summary>
/// Scrapes the criteria's data with its regular expression.
/// </summary>
/// <param name="scrapeCriteria">Data, pattern, regex options and optional parts to extract.</param>
/// <returns>
/// The whole text of each match if no parts are given; otherwise capture group 1 of each
/// part pattern that matches inside a found element.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    var output = new List<string>();

    foreach (Match element in Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption))
    {
        string wholeMatch = element.Value;

        if (!scrapeCriteria.Parts.Any())
        {
            // No need to go deeper.
            output.Add(wholeMatch);
        }
        else
        {
            // Go deeper: run every part pattern against the element and keep group 1 of the hits.
            var partValues = scrapeCriteria.Parts
                .Select(part => Regex.Match(wholeMatch, part.Regex, part.RegexOption))
                .Where(partMatch => partMatch.Success)
                .Select(partMatch => partMatch.Groups[1].Value);

            output.AddRange(partValues);
        }
    }

    return output;
}
/// <summary>
/// Finds every match of the criteria's pattern in its data and extracts the requested text.
/// </summary>
/// <param name="scrapeCriteria">Data, pattern, regex options and parts.</param>
/// <returns>
/// Capture group 1 of each succeeding part pattern when parts exist; otherwise the whole
/// text of each match.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    var extracted = new List<string>();
    var matches = Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption);

    foreach (Match current in matches)
    {
        string currentText = current.Value;

        if (scrapeCriteria.Parts.Any())
        {
            // Parts are defined: apply each one to the text of the current match.
            foreach (var part in scrapeCriteria.Parts)
            {
                Match partResult = Regex.Match(currentText, part.Regex, part.RegexOption);
                if (partResult.Success)
                {
                    extracted.Add(partResult.Groups[1].Value);
                }
            }
        }
        else
        {
            // No parts: keep the match as-is.
            extracted.Add(currentText);
        }
    }

    return extracted;
}
/// <summary>
/// Applies the given criteria and returns the scraped strings.
/// </summary>
/// <param name="ScrapeCriteria">Provides the input data, the pattern, the regex option and the parts.</param>
/// <returns>
/// A list holding either the full text of each match (no parts specified) or capture
/// group 1 of every part pattern that succeeds within a match.
/// </returns>
public List<string> Scrape(ScrapeCriteria ScrapeCriteria)
{
    List<string> harvested = new List<string>();

    // Arguments are: input, pattern, options.
    MatchCollection hits = Regex.Matches(ScrapeCriteria.Data, ScrapeCriteria.Regex, ScrapeCriteria.RegExOption);

    foreach (Match hit in hits)
    {
        bool hasParts = ScrapeCriteria.Parts.Any();
        string hitText = hit.Value;

        if (hasParts)
        {
            // Parts specified: extract group 1 of each part that matches this element.
            foreach (var part in ScrapeCriteria.Parts)
            {
                Match partHit = Regex.Match(hitText, part.Regex, part.RegexOption);
                if (partHit.Success)
                {
                    harvested.Add(partHit.Groups[1].Value);
                }
            }
        }
        else
        {
            // No parts specified: the matched element itself goes into the list.
            harvested.Add(hitText);
        }
    }

    return harvested;
}
/// <summary>
/// Scrapes text from the criteria's data at one or two levels of detail.
/// </summary>
/// <param name="scrapeCriteria">Data, pattern, regex options and optional parts.</param>
/// <returns>
/// First-level results (the whole text of each match) when no parts are present;
/// otherwise the deeper results (capture group 1 of each succeeding part pattern).
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    var found = new List<string>();

    foreach (Match topLevel in Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption))
    {
        // The first level of the match is its complete text.
        string topLevelText = topLevel.Value;

        // One level of detail only.
        if (!scrapeCriteria.Parts.Any())
        {
            found.Add(topLevelText);
            continue;
        }

        // Deeper level: search each part inside the first-level text.
        foreach (var part in scrapeCriteria.Parts)
        {
            Match deeper = Regex.Match(topLevelText, part.Regex, part.RegexOption);
            if (!deeper.Success)
            {
                continue;
            }

            found.Add(deeper.Groups[1].Value);
        }
    }

    return found;
}
/// <summary>
/// Scrapes the criteria's data by regular-expression matching.
/// </summary>
/// <param name="scrapeCriteria">Supplies the data, the pattern, the regex options and the parts.</param>
/// <returns>
/// The whole text of each match when the criteria has no parts; otherwise capture group 1
/// of each part pattern that matches inside a found element.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    var gathered = new List<string>();

    // Regex.Matches needs the data, the pattern and the options.
    var matches = Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption);

    foreach (Match candidate in matches)
    {
        string candidateText = candidate.Value;

        if (scrapeCriteria.Parts.Any())
        {
            // Sub-parts exist: match each one against the candidate and keep group 1 of the hits.
            // NOTE(review): `Regexoption` is spelled differently from `RegexOption` on the criteria - confirm the member name.
            foreach (var part in scrapeCriteria.Parts)
            {
                Match partMatch = Regex.Match(candidateText, part.Regex, part.Regexoption);
                if (partMatch.Success)
                {
                    gathered.Add(partMatch.Groups[1].Value);
                }
            }
        }
        else
        {
            // No sub-parts: the candidate itself is added.
            gathered.Add(candidateText);
        }
    }

    return gathered;
}
/// <summary>
/// Matches the criteria's pattern against its data and extracts the wanted strings.
/// </summary>
/// <param name="scrapeCriteria">Data, pattern, regex option and parts.</param>
/// <returns>
/// The whole text of each match when there are no parts; otherwise capture group 1 of
/// every part pattern that succeeds against a match.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    List<string> scraped = new List<string>();
    MatchCollection matches = Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption);

    foreach (Match item in matches)
    {
        string itemText = item.Value;

        if (!scrapeCriteria.Parts.Any())
        {
            scraped.Add(itemText);
            continue;
        }

        foreach (var part in scrapeCriteria.Parts)
        {
            Match partMatch = Regex.Match(itemText, part.Regex, part.RegexOptions);
            if (!partMatch.Success)
            {
                continue;
            }

            // Group 1 takes the captured group out of the whole match, for example:
            //   Groups[0] = ">2007 Toyota Highlander very low miles</a>"
            //   Groups[1] = "2007 Toyota Highlander very low miles"
            scraped.Add(partMatch.Groups[1].Value);
        }
    }

    return scraped;
}
/// <summary>
/// Searches the criteria's data for its pattern and projects each match to result strings.
/// </summary>
/// <param name="scrapeCriteria">Data to search, pattern, regex option and optional parts.</param>
/// <returns>
/// In match order: the whole text of a match when no parts exist, otherwise capture
/// group 1 of each part pattern that succeeds against that match.
/// </returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    // Regex.Matches takes the data, the pattern to look for, and the regex option.
    MatchCollection matches = Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOption);

    return matches
        .Cast<Match>()
        .SelectMany(match =>
        {
            string text = match.Value;

            // Without parts a match contributes exactly its own text.
            if (!scrapeCriteria.Parts.Any())
            {
                return Enumerable.Repeat(text, 1);
            }

            // With parts a match contributes group 1 of each part pattern that hits.
            return scrapeCriteria.Parts
                .Select(part => Regex.Match(text, part.Regex, part.RegexOption))
                .Where(partMatch => partMatch.Success)
                .Select(partMatch => partMatch.Groups[1].Value);
        })
        .ToList();
}
/// <summary>
/// Creates a <see cref="ScrapeCriteria"/> from the values currently held by this builder.
/// </summary>
/// <returns>A new criteria whose Data, Regex, RegexOption and Parts are copied from the builder's fields.</returns>
public ScrapeCriteria Build()
{
    var built = new ScrapeCriteria
    {
        Data = _data,
        Regex = _regex,
        RegexOption = _regexOptions,
        Parts = _parts
    };

    return built;
}
/// <summary>
/// Produces a <see cref="ScrapeCriteria"/> populated from this builder's state.
/// </summary>
/// <returns>A new criteria carrying the builder's data, regex, regex option and parts.</returns>
public ScrapeCriteria Build()
{
    return new ScrapeCriteria
    {
        Data = data,
        Regex = regex,
        RegexOption = regexOption,
        Parts = parts
    };
}
/// <summary>
/// Placeholder for the scraping operation.
/// </summary>
/// <param name="scrapeCriteria">The criteria to scrape with; not read by the current body.</param>
/// <returns>A new, empty list, because no scraping logic has been written yet.</returns>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    // TODO: implement the scraping operation; until then nothing is ever added.
    return new List<string>();
}
/// <summary>
/// Assembles a <see cref="ScrapeCriteria"/> from the builder's private fields.
/// </summary>
/// <returns>A new criteria with Data, Regex, RegexOptions and parts set from the builder.</returns>
public ScrapeCriteria Build()
{
    ScrapeCriteria criteria = new ScrapeCriteria();

    // Same assignment order as an object initializer would use.
    criteria.Data = _data;
    criteria.Regex = _regex;
    criteria.RegexOptions = _regexOption;
    criteria.parts = _parts;

    return criteria;
}
/// <summary>
/// Builds the <see cref="ScrapeCriteria"/> described by this builder.
/// </summary>
/// <returns>A new criteria whose Data, Regex, RegexOption and Parts come from the builder's fields.</returns>
public ScrapeCriteria Build()
{
    ScrapeCriteria result = new ScrapeCriteria();

    result.Data = _data;
    result.Regex = _regex;
    result.RegexOption = _regexOption;
    result.Parts = _parts;

    return result;
}
/// <summary>
/// Downloads a page for the given query and turns the pattern matches found in it into video items.
/// </summary>
/// <param name="searchQuery">
/// The query text. When it contains no space and is not exactly 11 characters long, the URL
/// comes from <c>urlBuilder</c>; otherwise the YouTube search-results URL is used.
/// </param>
/// <returns>
/// One <see cref="VideoItem"/> per match whose first capture group is exactly 11 characters
/// long, constructed from capture group 2 and capture group 1 (in that order).
/// </returns>
/// <remarks>
/// Download failures propagate to the caller with their original stack trace.
/// </remarks>
public static List<VideoItem> scrapeYoutube(String searchQuery)
{
    // Presumably the length of a YouTube video id - confirm.
    const int expectedIdLength = 11;

    List<VideoItem> results = new List<VideoItem>();

    string url;
    if (!searchQuery.Contains(' ') && searchQuery.Length != expectedIdLength)
    {
        url = urlBuilder(searchQuery);
    }
    else
    {
        // Escape the query so characters such as '&', '#' or '+' cannot alter the URL's structure.
        url = "https://www.youtube.com/results?search_query=" + Uri.EscapeDataString(searchQuery);
    }

    // The client is only needed for the download itself. No catch block here: the previous
    // "catch { throw ex; }" added nothing and reset the exception's stack trace.
    string data;
    using (WebClient webClient = new WebClient())
    {
        data = webClient.DownloadString(url);
    }

    ScrapeCriteria scrapeCriteria = new ScrapeCriteria(data, Constants.pattern);
    MatchCollection matches = scraper.scrape(scrapeCriteria);

    foreach (Match element in matches)
    {
        // Re-apply the pattern to the matched text in single-line mode to read its capture groups.
        Match match = Regex.Match(element.Groups[0].Value, Constants.pattern, RegexOptions.Singleline);

        if (match.Success && match.Groups[1].Value.Length == expectedIdLength)
        {
            results.Add(new VideoItem(match.Groups[2].Value, match.Groups[1].Value));
        }
    }

    return results;
}
/// <summary>
/// Adds the text extracted from a single match to the supplied list.
/// </summary>
/// <param name="scrapeCriteria">Supplies the parts whose patterns are applied to the match.</param>
/// <param name="scrappedElements">The list that receives the extracted strings.</param>
/// <param name="match">The match to extract from.</param>
private void ScrapeByRegex(ScrapeCriteria scrapeCriteria, List<string> scrappedElements, Match match)
{
    string wholeMatch = match.Value;

    // Without parts the whole match is the result.
    if (!scrapeCriteria.Parts.Any())
    {
        scrappedElements.Add(wholeMatch);
        return;
    }

    // Otherwise keep capture group 1 of every part pattern that hits.
    foreach (var part in scrapeCriteria.Parts)
    {
        Match partMatch = Regex.Match(wholeMatch, part.Regex, part.RegexOptions);
        if (!partMatch.Success)
        {
            continue;
        }

        scrappedElements.Add(partMatch.Groups[1].Value);
    }
}
/// <summary>
/// Matches the criteria's pattern against its data and hands each match to <c>ScrapeByRegex</c>.
/// </summary>
/// <param name="scrapeCriteria">Data, pattern, regex options and parts.</param>
/// <returns>The list filled by <c>ScrapeByRegex</c> across all matches.</returns>
/// <remarks>Any exception has its message logged through <c>Logger.Error</c> and is then rethrown.</remarks>
public List<string> Scrape(ScrapeCriteria scrapeCriteria)
{
    try
    {
        MatchCollection found = Regex.Matches(scrapeCriteria.Data, scrapeCriteria.Regex, scrapeCriteria.RegexOptions);
        List<string> results = new List<string>();

        foreach (Match current in found)
        {
            ScrapeByRegex(scrapeCriteria, results, current);
        }

        return results;
    }
    catch (Exception failure)
    {
        // Log, then rethrow with the original stack trace intact.
        Logger.Error(failure.Message);
        throw;
    }
}
/// <summary>
/// Runs the criteria's regular expression over its data.
/// </summary>
/// <param name="scrapeCriteria">Supplies the input data, the pattern and the regex options.</param>
/// <returns>The collection of matches returned by <c>Regex.Matches</c>.</returns>
public MatchCollection scrape(ScrapeCriteria scrapeCriteria)
{
    var input = scrapeCriteria.Data;
    var pattern = scrapeCriteria.Regex;
    var options = scrapeCriteria.regexOptions;

    MatchCollection matches = Regex.Matches(input, pattern, options);
    return matches;
}