示例#1
0
        public async Task AddScrapeEvent(int templateId, ScrapeEvent scrapeEvent)
        {
            // Get the target template.
            var dbTemplate = DbContext.Templates
                             .First(t => t.TemplateId == templateId);

            // Map the object to the corresponding DAL model.
            var dbScrapeEvent = Mapper.Map <DAL.ScrapeEvent>(scrapeEvent);

            // Add the mapped event to the database and save the changes.
            dbTemplate.ScrapeEvents.Add(dbScrapeEvent);
            await DbContext.SaveChangesAsync();
        }
示例#2
0
        public static async Task ScrapeAllSources()
        {
            //var dataSources = await SourceClient.GetSourcesAsync();
            var areasOfConcern = await AreaClient.GetAreasAsync();

            var allTemplates = await TemplateClient.GetTemplatesAsync();

            var topLevelBrowser = new ScrapingBrowser();

            // Dunno what these do but the tutorial had them.
            //topLevelBrowser.AllowAutoRedirect = true;
            //topLevelBrowser.AllowMetaRedirect = true;

            foreach (var area in areasOfConcern)
            {
                foreach (var entryPoint in area.EntryPoints)
                {
                    // Get a scraping Template for the Source associated with the current Entry Point.
                    var topLevelTemplate = allTemplates
                                           .First(t => t.SourceId == entryPoint.SourceId && t.IsTopLevel);

                    // Create new scraping event and assign the area ID.
                    var scrapingEvent = new ScrapeEvent();
                    scrapingEvent.AreaId = area.AreaId;

                    // Create the record collection.
                    scrapingEvent.Records = new System.Collections.ObjectModel.ObservableCollection <ScrapeRecord>();

                    // Begin scraping from the top-level Template.
                    ScrapeFromTemplate(topLevelBrowser, scrapingEvent, entryPoint.EntryPointUri, topLevelTemplate, allTemplates);

                    // Send scraping event to the WebServices to be added to the database.
                    await TemplateClient.AddScrapeEventAsync(topLevelTemplate.TemplateId, scrapingEvent);
                }
            }
        }
示例#3
0
        private static void ScrapeFromTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, string templateUri, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection <TemplateInstructions> allTemplates)
        {
            // If the lock is not set to true, then scraping has begun for a new record (new property).
            //if (!scrapingEvent.LockOnCurrentRecord)
            //{
            //    scrapingEvent.Records.Add(new ScrapeRecord());
            //}

            var entryUri       = new Uri(templateUri);
            var pageResult     = scrapingBrowser.NavigateToPage(entryUri);
            var scraperPointer = pageResult.Html;

            // Perform setup actions for current Template.
            scraperPointer = PerformTemplateSetup(scraperPointer, sourceTemplate.OrderedSetup);

            // If the source template is a top-level template then looping over several items is necessary.
            if (sourceTemplate.IsTopLevel)
            {
                Console.WriteLine($"Traveling to top-level template @ {templateUri}");

                // Loop through each scrapable item and following all Scraping Steps for each Template Field to gather together the complete record.
                while (scraperPointer != null)
                {
                    // If this record violates any disqualifiers then skip it.
                    if (sourceTemplate.Disqualifiers.Count() != 0)
                    {
                        var disqualifyRecord = false;

                        foreach (var disqualifier in sourceTemplate.Disqualifiers)
                        {
                            switch (disqualifier.DisqualificationType)
                            {
                            case DisqualificationType.RecordNodeHasClass:
                                var classAttr = scraperPointer.Attributes["class"];

                                if (classAttr != null && classAttr.Value.Contains(disqualifier.Parameters))
                                {
                                    disqualifyRecord = true;
                                }
                                break;

                            default:
                                throw new Exception("Invalid or unknown disqualifier.");
                            }

                            if (disqualifyRecord)
                            {
                                break;
                            }
                        }

                        if (disqualifyRecord)
                        {
                            // Move pointer to next property and continue.
                            scraperPointer = scraperPointer.NextSibling;
                            continue;
                        }
                    }

                    // Create a record for the new property.
                    var newRecord = new ScrapeRecord();

                    // Create needed dictionaries.
                    newRecord.TargetFieldIdToValueDictionary    = new Dictionary <string, string>();
                    newRecord.TemporaryFieldIdToValueDictionary = new Dictionary <string, string>();

                    // Add the new record to the current event.
                    scrapingEvent.Records.Add(newRecord);

                    SinglePassScrapeTemplate(scrapingBrowser, scrapingEvent, scraperPointer, sourceTemplate, allTemplates);

                    // Move pointer to next property.
                    scraperPointer = scraperPointer.NextSibling;
                }
            }
            else
            {
                Console.WriteLine($"\nTraveling to child template @ {templateUri}");

                SinglePassScrapeTemplate(scrapingBrowser, scrapingEvent, scraperPointer, sourceTemplate, allTemplates);

                // Child templates are the last things that are handled in the scraping order.  Once a child template is broken out of set the record lock to false since a new record is about to begin (nested child template depth should be irrelevant).
                //scrapingEvent.LockOnCurrentRecord = false;
            }
        }
示例#4
0
        private static void PerformScrapingForTemplateField(HtmlNode pointer, TargetTemplateField templateField, ScrapeEvent scrapeEvent, string rootUri)
        {
            string targetValue    = null;
            var    jsonSerializer = new JavaScriptSerializer();

            foreach (var setupStep in templateField.OrderedScrapingSteps)
            {
                switch (setupStep.ActionType)
                {
                case ScrapingActionType.CssSelect:
                    pointer = pointer
                              .CssSelect(setupStep.Parameters)
                              .First();
                    break;

                case ScrapingActionType.FirstDescendantNodeOfType:
                    pointer = pointer
                              .Descendants()
                              .First(c => c.Name == setupStep.Parameters);
                    break;

                case ScrapingActionType.FirstDescendantNodeOfTypeAttrsVals:

                    var firstDescendantNodeOfTypeAttrsValsParams = new FirstDescendantNodeOfTypeAttrsValsParams();
                    firstDescendantNodeOfTypeAttrsValsParams = jsonSerializer
                                                               .Deserialize <FirstDescendantNodeOfTypeAttrsValsParams>(setupStep.Parameters);

                    var possibleChoices = pointer
                                          .Descendants()
                                          .Where(n => n.Name == firstDescendantNodeOfTypeAttrsValsParams.Node);

                    // All attribute-value pairs must be present.
                    for (var i = 0; i < firstDescendantNodeOfTypeAttrsValsParams.Attributes.Count(); i++)
                    {
                        var attribute          = firstDescendantNodeOfTypeAttrsValsParams.Attributes[i];
                        var correspondingValue = firstDescendantNodeOfTypeAttrsValsParams.Values[i];

                        possibleChoices = possibleChoices
                                          .Where(n =>
                                                 n.Attributes.Contains(attribute) &&
                                                 n.Attributes[attribute].Value == correspondingValue);
                    }

                    // If a depth value is given, grab the Nth descendant.  Otherwise, grab the first.
                    if (firstDescendantNodeOfTypeAttrsValsParams.Depth.HasValue)
                    {
                        pointer = possibleChoices.ElementAt(firstDescendantNodeOfTypeAttrsValsParams.Depth.Value - 1);
                    }
                    else
                    {
                        pointer = possibleChoices.First();
                    }

                    break;

                case ScrapingActionType.NthDescendantNodeOfType:
                    break;

                case ScrapingActionType.SelectInnerText:
                    targetValue = pointer.InnerText;
                    break;

                case ScrapingActionType.SelectAttributeValue:
                    targetValue = pointer.Attributes[setupStep.Parameters].Value;
                    break;

                //case ScrapingActionType.SelectInnerTextTemporaryValue:
                //    temporaryValueKey = setupStep.Parameters;
                //    targetValue = pointer.InnerText;
                //    break;

                // Perform no action - navigation can't ocur in a static method.
                case ScrapingActionType.TravelToChildTemplate:
                    break;

                case ScrapingActionType.TemporaryTravelToDerivedLocation:

                    var temporaryBrowser = new ScrapingBrowser();

                    var entryUri   = new Uri(rootUri + targetValue);
                    var pageResult = temporaryBrowser.NavigateToPage(entryUri);
                    pointer = pageResult.Html;

                    break;

                case ScrapingActionType.SplitData:

                    var splitDataParams = new SplitDataParams();
                    splitDataParams = jsonSerializer
                                      .Deserialize <SplitDataParams>(setupStep.Parameters);

                    var location = pointer.InnerText.IndexOf(splitDataParams.Split);

                    for (var i = 0; i < splitDataParams.Index; i++)
                    {
                        location = pointer.InnerText.IndexOf(splitDataParams.Split, location);
                    }

                    //var splitSubstring = pointer.InnerText.Substring()

                    break;

                case ScrapingActionType.ReplaceInstancesOf:

                    var replaceInstancesOfParams = new ReplaceInstancesOfParams();
                    replaceInstancesOfParams = jsonSerializer
                                               .Deserialize <ReplaceInstancesOfParams>(setupStep.Parameters);

                    targetValue = targetValue.Replace(replaceInstancesOfParams.Replace, replaceInstancesOfParams.With);

                    break;

                case ScrapingActionType.RemoveOutsideOfBordersExclusive:

                    var removeOutsideOfBordersParams = new RemoveOutsideOfBordersParams();
                    removeOutsideOfBordersParams = jsonSerializer
                                                   .Deserialize <RemoveOutsideOfBordersParams>(setupStep.Parameters);

                    var left    = removeOutsideOfBordersParams.Left;
                    var right   = removeOutsideOfBordersParams.Right;
                    var leftLen = left.Length;

                    var indexOfRight = targetValue.IndexOf(right);
                    var indexOfLeft  = targetValue.LastIndexOf(left, indexOfRight);

                    targetValue = targetValue.Substring(indexOfLeft + left.Length, indexOfRight - indexOfLeft - leftLen);

                    break;

                case ScrapingActionType.Trim:

                    targetValue = targetValue.Trim();
                    break;

                case ScrapingActionType.NodeWithInnerTextSubstring:

                    var nodeWithInnerTextSubstringParams = new NodeWithInnerTextSubstringParams();
                    nodeWithInnerTextSubstringParams = jsonSerializer
                                                       .Deserialize <NodeWithInnerTextSubstringParams>(setupStep.Parameters);

                    pointer = pointer
                              .Descendants()
                              .First(n => n.Name == nodeWithInnerTextSubstringParams.Node &&
                                     n.InnerText.Contains(nodeWithInnerTextSubstringParams.Substring));

                    break;

                default:
                    throw new Exception("Invalid or unknown setup step.");
                }
            }

            var scrapeRecord = scrapeEvent.Records.Last();

            // Add recovered value to the scrape event field value dictionary.
            var stringTempFieldId = templateField.TargetFieldId.ToString(); // .TemplateFieldId.Value

            if (targetValue != null)
            {
                if (templateField.IsTemporaryField)
                {
                    scrapeRecord.TemporaryFieldIdToValueDictionary.Add(stringTempFieldId, targetValue);
                }
                else
                {
                    scrapeRecord.TargetFieldIdToValueDictionary.Add(stringTempFieldId, targetValue);
                }
            }
        }
示例#5
0
        private static void SinglePassScrapeTemplate(ScrapingBrowser scrapingBrowser, ScrapeEvent scrapingEvent, HtmlNode scraperPointer, TemplateInstructions sourceTemplate, System.Collections.ObjectModel.ObservableCollection <TemplateInstructions> allTemplates)
        {
            var absoluteUri = scrapingBrowser.Referer.AbsoluteUri;
            var rootUri     = absoluteUri.Substring(0, absoluteUri.IndexOf('/', 8));

            foreach (var templateField in sourceTemplate.TemplateFields)
            {
                if (templateField.OrderedScrapingSteps.Count() == 0)
                {
                    continue;
                }

                try
                {
                    PerformScrapingForTemplateField(scraperPointer, templateField, scrapingEvent, rootUri);
                }
                catch (Exception ex)
                {
                    Console.WriteLine($"\t\"{templateField.Name}\" field missing from entry.");
                }

                // Perform navigation action if the last step of a template's scraping instructions dictates it.
                var lastStep = templateField.OrderedScrapingSteps.Last();
                if (lastStep.ActionType == ScrapingActionType.TravelToChildTemplate)
                {
                    var jsonSerializer = new JavaScriptSerializer();

                    var travelToChildTemplateParams = new TravelToChildTemplateParams();
                    travelToChildTemplateParams = jsonSerializer
                                                  .Deserialize <TravelToChildTemplateParams>(lastStep.Parameters);

                    // Need child template instructions.
                    var childTemplateId = travelToChildTemplateParams.Child;
                    var childTemplate   = allTemplates
                                          .First(t => t.TemplateId == childTemplateId && !t.IsTopLevel);

                    int    targetFieldId;
                    var    sourceIsNumericId = int.TryParse(travelToChildTemplateParams.Source, out targetFieldId);
                    string targetUri;

                    // The target URI was saved as a temporary field.
                    if (travelToChildTemplateParams.IsFromTemporaryField)
                    {
                        targetUri = scrapingEvent.Records.Last().TemporaryFieldIdToValueDictionary[targetFieldId.ToString()];
                    }

                    // The target URI is a standard, non-temporary field.
                    else
                    {
                        targetUri = scrapingEvent.Records.Last().TargetFieldIdToValueDictionary[targetFieldId.ToString()];
                    }

                    // Set record lock so that the same record is used.
                    scrapingEvent.LockOnCurrentRecord = true;

                    ScrapeFromTemplate(scrapingBrowser, scrapingEvent, rootUri + targetUri, childTemplate, allTemplates);
                }
            }
        }
示例#6
0
 public async Task AddScrapeEvent(int templateId, [FromBody] ScrapeEvent scrapeEvent)
 {
     await TempSvc.AddScrapeEvent(templateId, scrapeEvent);
 }