/// <summary>
/// Loads the page at <paramref name="templateUri"/>, applies the template's setup steps to it and
/// scrapes it according to <paramref name="sourceTemplate"/>.
/// </summary>
/// <remarks>
/// For a top-level template the node returned by setup is treated as the first record node: the
/// method walks its <c>NextSibling</c> chain, skips nodes rejected by the template's disqualifiers,
/// and for every remaining node appends a fresh <see cref="ScrapeRecord"/> to
/// <paramref name="scrapingEvent"/> before scraping that node. For any other template a single
/// scraping pass is made and no record is created here.
/// </remarks>
/// <param name="scrapingBrowser">Browser used to navigate to <paramref name="templateUri"/>.</param>
/// <param name="scrapingEvent">Event whose <c>Records</c> collection receives the new records.</param>
/// <param name="templateUri">Absolute URI of the page to scrape; it is parsed with <see cref="Uri"/>.</param>
/// <param name="sourceTemplate">Instructions (setup, disqualifiers, fields) for this page.</param>
/// <param name="allTemplates">All known templates; passed through to the single-pass scraper.</param>
/// <exception cref="NotSupportedException">A disqualifier has a type this method does not handle.</exception>
private static void ScrapeFromTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, string templateUri, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection<TemplateInstructions> allTemplates)
{
    var pageResult = scrapingBrowser.NavigateToPage(new Uri(templateUri));
    var scraperPointer = pageResult.Html;

    // Perform setup actions for the current template; the result is the node scraping starts from.
    scraperPointer = PerformTemplateSetup(scraperPointer, sourceTemplate.OrderedSetup);

    if (!sourceTemplate.IsTopLevel)
    {
        Console.WriteLine($"\nTraveling to child template @ {templateUri}");

        // A child template contributes to a record that already exists, so no record is added here.
        SinglePassScrapeTemplate(scrapingBrowser, scrapingEvent, scraperPointer, sourceTemplate, allTemplates);
        return;
    }

    Console.WriteLine($"Traveling to top-level template @ {templateUri}");

    // A top-level template lists several items: each sibling of the starting node is one candidate record.
    // NOTE(review): if HtmlNode.NextSibling also yields text/comment nodes (as HtmlAgilityPack's does),
    // whitespace between elements gets its own (empty) record -- confirm this is intended.
    for (var recordNode = scraperPointer; recordNode != null; recordNode = recordNode.NextSibling)
    {
        // If this record violates any disqualifier then skip it.
        if (IsRecordNodeDisqualified(recordNode, sourceTemplate))
        {
            continue;
        }

        // Create the record together with the dictionaries it needs.
        var newRecord = new ScrapeRecord
        {
            TargetFieldIdToValueDictionary = new Dictionary<string, string>(),
            TemporaryFieldIdToValueDictionary = new Dictionary<string, string>()
        };

        // The record must be in the event before scraping: the single-pass scraper reads
        // scrapingEvent.Records.Last() when it follows a link to a child template.
        scrapingEvent.Records.Add(newRecord);

        SinglePassScrapeTemplate(scrapingBrowser, scrapingEvent, recordNode, sourceTemplate, allTemplates);
    }
}

/// <summary>
/// Returns <c>true</c> as soon as one of the template's disqualifiers rejects <paramref name="recordNode"/>.
/// </summary>
/// <exception cref="NotSupportedException">A disqualifier with an unhandled type is reached.</exception>
private static bool IsRecordNodeDisqualified(HtmlNode recordNode, TemplateInstructions sourceTemplate)
{
    foreach (var disqualifier in sourceTemplate.Disqualifiers)
    {
        switch (disqualifier.DisqualificationType)
        {
            case DisqualificationType.RecordNodeHasClass:
                // NOTE(review): this is a substring match on the whole class attribute, so a
                // parameter of "ad" also matches class="header" -- confirm that is acceptable.
                var classAttr = recordNode.Attributes["class"];
                if (classAttr != null && classAttr.Value.Contains(disqualifier.Parameters))
                {
                    return true;
                }
                break;

            default:
                throw new NotSupportedException("Invalid or unknown disqualifier.");
        }
    }

    return false;
}
/// <summary>
/// Runs every field of <paramref name="sourceTemplate"/> once against <paramref name="scraperPointer"/>
/// and, for each field whose last scraping step is <c>TravelToChildTemplate</c>, follows the stored
/// link and scrapes the child template as well.
/// </summary>
/// <remarks>
/// Field scraping is best-effort: an exception from one field is logged to the console and the
/// remaining fields are still processed. A child link whose URI was never stored, is blank, or cannot
/// be resolved is likewise logged and skipped.
/// </remarks>
/// <param name="scrapingBrowser">Browser whose <c>Referer</c> identifies the page currently being scraped.</param>
/// <param name="scrapingEvent">Event whose last record supplies the stored child-template URI.</param>
/// <param name="scraperPointer">Node the scraping steps are applied to.</param>
/// <param name="sourceTemplate">Template whose fields are scraped.</param>
/// <param name="allTemplates">All known templates; searched for the referenced child template.</param>
/// <exception cref="InvalidOperationException">
/// No non-top-level template in <paramref name="allTemplates"/> has the referenced child ID, or
/// <paramref name="scrapingEvent"/> has no records when a child link is followed.
/// </exception>
private static void SinglePassScrapeTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, HtmlNode scraperPointer, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection<TemplateInstructions> allTemplates)
{
    // Scheme + authority of the current page (e.g. "https://example.com"), used as the base for links.
    var rootUri = scrapingBrowser.Referer.GetLeftPart(UriPartial.Authority);
    var baseUri = new Uri(rootUri);
    var jsonSerializer = new JavaScriptSerializer();

    foreach (var templateField in sourceTemplate.TemplateFields)
    {
        if (!templateField.OrderedScrapingSteps.Any())
        {
            continue;
        }

        try
        {
            PerformScrapingForTemplateField(scraperPointer, templateField, scrapingEvent, rootUri);
        }
        catch (Exception ex)
        {
            // Best-effort: one unscrapable field must not abort the record. The cause is included
            // so a genuine error can be told apart from a field that is simply absent.
            Console.WriteLine($"\t\"{templateField.Name}\" field missing from entry. ({ex.GetType().Name}: {ex.Message})");
        }

        // Perform navigation only if the last step of the field's scraping instructions dictates it.
        var lastStep = templateField.OrderedScrapingSteps.Last();
        if (lastStep.ActionType != ScrapingActionType.TravelToChildTemplate)
        {
            continue;
        }

        var travelToChildTemplateParams = jsonSerializer
            .Deserialize<TravelToChildTemplateParams>(lastStep.Parameters);

        // Need child template instructions. A missing template is a configuration error, so
        // First() is allowed to throw rather than being skipped silently.
        var childTemplateId = travelToChildTemplateParams.Child;
        var childTemplate = allTemplates
            .First(t => t.TemplateId == childTemplateId && !t.IsTopLevel);

        // Numeric sources are normalised through int; anything else is used verbatim as the key
        // instead of silently collapsing to "0".
        int targetFieldId;
        var sourceKey = int.TryParse(travelToChildTemplateParams.Source, out targetFieldId)
            ? targetFieldId.ToString()
            : travelToChildTemplateParams.Source;

        var currentRecord = scrapingEvent.Records.Last();
        string targetUri = null;
        var uriWasStored = false;
        if (!string.IsNullOrEmpty(sourceKey))
        {
            if (travelToChildTemplateParams.IsFromTemporaryField)
            {
                // The target URI was saved as a temporary field.
                uriWasStored = currentRecord.TemporaryFieldIdToValueDictionary.TryGetValue(sourceKey, out targetUri);
            }
            else
            {
                // The target URI is a standard, non-temporary field.
                uriWasStored = currentRecord.TargetFieldIdToValueDictionary.TryGetValue(sourceKey, out targetUri);
            }
        }

        if (!uriWasStored || string.IsNullOrWhiteSpace(targetUri))
        {
            Console.WriteLine($"\tNo URI stored under field \"{sourceKey}\"; skipping child template \"{childTemplateId}\".");
            continue;
        }

        // Resolve against the site root so root-relative, slash-less and absolute links all work.
        Uri childUri;
        if (!Uri.TryCreate(baseUri, targetUri, out childUri))
        {
            Console.WriteLine($"\tCould not resolve \"{targetUri}\" against {rootUri}; skipping child template \"{childTemplateId}\".");
            continue;
        }

        // Mark that the child pass continues the current record (the flag is not read in this method).
        scrapingEvent.LockOnCurrentRecord = true;

        ScrapeFromTemplate(scrapingBrowser, scrapingEvent, childUri.AbsoluteUri, childTemplate, allTemplates);
    }
}