Exemplo n.º 1
0
        /// <summary>
        /// Navigates to <paramref name="templateUri"/>, runs the template's setup actions on the loaded page and
        /// scrapes it according to <paramref name="sourceTemplate"/>.
        /// </summary>
        /// <remarks>
        /// For a top-level template, scraping starts at the node returned by the setup actions and walks its
        /// following siblings: every node that is not disqualified gets a new <see cref="ScrapeRecord"/> appended
        /// to <paramref name="scrapingEvent"/> and is scraped once.  For any other template the node returned by
        /// the setup actions is scraped exactly once and no record is created here.
        /// </remarks>
        /// <param name="scrapingBrowser">Browser used to load the page.</param>
        /// <param name="scrapingEvent">Event whose record list receives the newly created records.</param>
        /// <param name="templateUri">Absolute URI of the page to scrape.</param>
        /// <param name="sourceTemplate">Template describing setup, disqualifiers and fields for this page.</param>
        /// <param name="allTemplates">All known templates; passed through for child-template lookups.</param>
        /// <exception cref="InvalidOperationException">A disqualifier has a type that is not handled.</exception>
        private static void ScrapeFromTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, string templateUri, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection <TemplateInstructions> allTemplates)
        {
            var entryUri       = new Uri(templateUri);
            var pageResult     = scrapingBrowser.NavigateToPage(entryUri);
            var scraperPointer = pageResult.Html;

            // Perform setup actions for the current template; the returned node is where scraping starts.
            scraperPointer = PerformTemplateSetup(scraperPointer, sourceTemplate.OrderedSetup);

            if (!sourceTemplate.IsTopLevel)
            {
                Console.WriteLine($"\nTraveling to child template @ {templateUri}");

                // No record is created for a child template; the single pass runs against the event as it is.
                SinglePassScrapeTemplate(scrapingBrowser, scrapingEvent, scraperPointer, sourceTemplate, allTemplates);
                return;
            }

            Console.WriteLine($"Traveling to top-level template @ {templateUri}");

            // A top-level template lists several items: visit the start node and each of its following siblings.
            for (var recordNode = scraperPointer; recordNode != null; recordNode = recordNode.NextSibling)
            {
                // Nodes that violate a disqualifier produce no record at all.
                if (RecordNodeIsDisqualified(recordNode, sourceTemplate))
                {
                    continue;
                }

                // Create the record (with its two value dictionaries) before scraping so that it is the last
                // record of the event while the fields of this node are processed.
                var newRecord = new ScrapeRecord
                {
                    TargetFieldIdToValueDictionary    = new Dictionary <string, string>(),
                    TemporaryFieldIdToValueDictionary = new Dictionary <string, string>()
                };

                scrapingEvent.Records.Add(newRecord);

                SinglePassScrapeTemplate(scrapingBrowser, scrapingEvent, recordNode, sourceTemplate, allTemplates);
            }
        }

        /// <summary>
        /// Checks whether <paramref name="recordNode"/> matches any disqualifier of
        /// <paramref name="sourceTemplate"/>.  A missing or empty disqualifier collection disqualifies nothing.
        /// </summary>
        /// <param name="recordNode">Candidate record node to test.</param>
        /// <param name="sourceTemplate">Template whose disqualifiers are evaluated in order.</param>
        /// <returns><c>true</c> as soon as one disqualifier matches; otherwise <c>false</c>.</returns>
        /// <exception cref="InvalidOperationException">A disqualifier has a type that is not handled.</exception>
        private static bool RecordNodeIsDisqualified(HtmlNode recordNode, TemplateInstructions sourceTemplate)
        {
            if (sourceTemplate.Disqualifiers == null)
            {
                return false;
            }

            foreach (var disqualifier in sourceTemplate.Disqualifiers)
            {
                switch (disqualifier.DisqualificationType)
                {
                case DisqualificationType.RecordNodeHasClass:
                    // Substring match against the raw "class" attribute value of the node.
                    var classAttr = recordNode.Attributes["class"];

                    if (classAttr != null && classAttr.Value.Contains(disqualifier.Parameters))
                    {
                        return true;
                    }
                    break;

                default:
                    throw new InvalidOperationException($"Invalid or unknown disqualifier: {disqualifier.DisqualificationType}.");
                }
            }

            return false;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs the scraping steps of every field of <paramref name="sourceTemplate"/> once against
        /// <paramref name="scraperPointer"/>.  When the last scraping step of a field is
        /// <c>TravelToChildTemplate</c>, the URI stored for that step in the last record of
        /// <paramref name="scrapingEvent"/> is followed and the child template is scraped.
        /// </summary>
        /// <param name="scrapingBrowser">Browser whose <c>Referer</c> supplies the site root and which loads child pages.</param>
        /// <param name="scrapingEvent">Event whose last record supplies the URI of a child page.</param>
        /// <param name="scraperPointer">Node the field scraping steps are applied to.</param>
        /// <param name="sourceTemplate">Template whose fields are processed in order.</param>
        /// <param name="allTemplates">All known templates; searched for the referenced child template.</param>
        /// <exception cref="InvalidOperationException">No non-top-level template has the referenced child ID.</exception>
        /// <exception cref="KeyNotFoundException">The last record holds no value for the field that should contain the child URI.</exception>
        private static void SinglePassScrapeTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, HtmlNode scraperPointer, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection <TemplateInstructions> allTemplates)
        {
            // Scheme and authority of the browser's Referer (e.g. "https://example.com", no trailing slash).
            // Scraped child links are appended to it below.
            var rootUri = scrapingBrowser.Referer.GetLeftPart(UriPartial.Authority);

            foreach (var templateField in sourceTemplate.TemplateFields)
            {
                // Fields without scraping steps have nothing to do.
                if (!templateField.OrderedScrapingSteps.Any())
                {
                    continue;
                }

                try
                {
                    PerformScrapingForTemplateField(scraperPointer, templateField, scrapingEvent, rootUri);
                }
                catch (Exception ex)
                {
                    // Best effort: a field that cannot be scraped must not abort the remaining fields.  The
                    // cause is logged as well because the failure is not necessarily a missing field.
                    Console.WriteLine($"\t\"{templateField.Name}\" field missing from entry. ({ex.GetType().Name}: {ex.Message})");
                }

                // Navigation happens only when the last step of the field's scraping instructions asks for it.
                var lastStep = templateField.OrderedScrapingSteps.Last();
                if (lastStep.ActionType != ScrapingActionType.TravelToChildTemplate)
                {
                    continue;
                }

                var jsonSerializer = new JavaScriptSerializer();
                var travelParams   = jsonSerializer.Deserialize <TravelToChildTemplateParams>(lastStep.Parameters);

                // Need the child template instructions; only non-top-level templates qualify.
                var childTemplateId = travelParams.Child;
                var childTemplate   = allTemplates
                                      .FirstOrDefault(t => t.TemplateId == childTemplateId && !t.IsTopLevel);

                if (childTemplate == null)
                {
                    throw new InvalidOperationException($"No child template with ID {childTemplateId} was found.");
                }

                // A numeric source is normalised through int (so "007" looks up "7"); any other source is used
                // verbatim as the key instead of silently falling back to "0".
                int targetFieldId;
                var sourceIsNumericId = int.TryParse(travelParams.Source, out targetFieldId);
                var fieldKey          = sourceIsNumericId ? targetFieldId.ToString() : travelParams.Source;

                // The target URI was saved on the current (last) record, either as a temporary field or as a
                // standard, non-temporary field.
                var    currentRecord = scrapingEvent.Records.Last();
                string targetUri     = null;
                var    uriFound      = false;

                if (fieldKey != null)
                {
                    uriFound = travelParams.IsFromTemporaryField
                        ? currentRecord.TemporaryFieldIdToValueDictionary.TryGetValue(fieldKey, out targetUri)
                        : currentRecord.TargetFieldIdToValueDictionary.TryGetValue(fieldKey, out targetUri);
                }

                if (!uriFound)
                {
                    var fieldKind = travelParams.IsFromTemporaryField ? "temporary" : "target";
                    throw new KeyNotFoundException($"Cannot travel to child template {childTemplateId}: the current record has no {fieldKind} field \"{fieldKey}\" holding the child URI.");
                }

                // Set record lock so that the same record is used (the flag is not read in this method).
                scrapingEvent.LockOnCurrentRecord = true;

                ScrapeFromTemplate(scrapingBrowser, scrapingEvent, rootUri + targetUri, childTemplate, allTemplates);
            }
        }