/// <summary>
/// Reads the children of an <c>entry</c> element and builds one <see cref="AutoDetectLink"/>
/// for every <c>link</c> start element found.
/// </summary>
/// <param name="reader">
/// XML reader to consume. Reading stops once the current node is the closing <c>entry</c> tag
/// or <see cref="XmlReader.Read()"/> returns <c>false</c>.
/// </param>
/// <param name="config">Configuration handed to every created link; its predefined values take part in resolution.</param>
/// <param name="job">Job handed to every created link; its predefined values take part in resolution.</param>
/// <returns>The created links, in the order they were read.</returns>
/// <exception cref="ArgumentException">A start element other than <c>link</c> was encountered.</exception>
private static IList<ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
{
    var entryLinks = new List<ResourceLink>();

    // Keep reading until we sit on </entry> or the reader has nothing more to give.
    while ((reader.Name != "entry" || reader.NodeType != XmlNodeType.EndElement) && reader.Read())
    {
        if (!reader.IsStartElement())
        {
            continue;
        }

        // "link" is the only child element accepted here.
        if (reader.Name != "link")
        {
            throw new ArgumentException("Unrecognized element", reader.Name);
        }

        var linkDefinition = ReadExtractionLinkSection(reader, config);

        // Gather the predefined items of the link, grouped by item name.
        var predefinedItems = new CollectionDictionary<string, StringWithDependencies>();
        foreach (var predefinedItem in linkDefinition.PredefinedExtractionItems.Values)
        {
            predefinedItems.AddValue(predefinedItem.Name, predefinedItem.Value);
        }

        // Only values that do not require resolving are fed into the data source,
        // keyed by item name and represented by their format string.
        var constantItems = predefinedItems
            .Where(entry => entry.Value.Any(candidate => !candidate.RequiresResolve))
            .ToCollectionDictionary(
                entry => entry.Key,
                entry => entry.Value
                    .Where(candidate => !candidate.RequiresResolve)
                    .Select(candidate => candidate.FormatString)
            );
        var resolver = new DependencyDataSource(
            constantItems,
            config.PredefinedValues,
            job.PredefinedValues
        );

        // NOTE: These are entry links, so they can't have any location to extract items from, only constant values
        entryLinks.Add(
            new AutoDetectLink(
                resolver.Resolve(linkDefinition.Value),
                linkDefinition.HttpMethod,
                resolver.Resolve(linkDefinition.Parameters),
                resolver.Resolve(linkDefinition.Headers),
                linkDefinition.ExtractLinks,
                linkDefinition.ExtractData,
                config,
                job,
                resolver.Resolve(predefinedItems),
                (job as CrawlingBase ?? config)?.InitializationDocumentLink
            )
        );
    }

    return entryLinks;
}
/// <summary>
/// Lazily yields one <see cref="ResourceLink"/> for every value stored in <c>ExtractedItems</c>
/// under the name of <paramref name="extractionLink"/>.
/// </summary>
/// <param name="extractionLink">
/// Link definition supplying the name to look up, the link type, and the request settings
/// (HTTP method, parameters, headers, extract flags, predefined items).
/// </param>
/// <returns>
/// A deferred sequence of links; empty when nothing was extracted under the link's name.
/// </returns>
/// <exception cref="NotSupportedException">
/// Raised during enumeration when <c>extractionLink.Type</c> is not <c>Document</c>, <c>File</c> or <c>Auto</c>.
/// </exception>
protected IEnumerable<ResourceLink> ExtractResourceLinks(ExtractionLink extractionLink)
{
    if (!ExtractedItems.ContainsKey(extractionLink.Name))
    {
        yield break;
    }

    // The collection is looked up again on every pass because this is an iterator:
    // callers may run arbitrary code between two yielded links.
    for (var index = 0; index < ExtractedItems[extractionLink.Name].Count; index++)
    {
        var target = ExtractedItems[extractionLink.Name][index];
        var positionInDocument = new ResponseParserPositionPointer(extractionLink.Location, index);

        // Predefined items are extracted into a dictionary private to this one link.
        var scopedItems = new CollectionDictionary<string, string>();
        foreach (var predefinedItem in extractionLink.PredefinedExtractionItems.Values)
        {
            ExtractItem(
                predefinedItem,
                extractionLink.PredefinedExtractionItems,
                scopedItems,
                // The position pointer is only passed when item locations are relative to the link.
                extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink
                    ? positionInDocument
                    : (ResponseParserPositionPointer?)null
            );
        }

        switch (extractionLink.Type)
        {
            case ExtractionLink.LinkTypes.Document:
                yield return new DocumentLink(
                    target,
                    extractionLink.HttpMethod,
                    DependencyDataSource.Resolve(extractionLink.Parameters),
                    DependencyDataSource.Resolve(extractionLink.Headers),
                    extractionLink.ExtractLinks,
                    extractionLink.ExtractData,
                    _documentLink.Config,
                    _documentLink.Job,
                    scopedItems,
                    _documentLink
                );
                break;

            case ExtractionLink.LinkTypes.File:
                // File links are built without HTTP method, extract flags or scoped items.
                yield return new FileLink(
                    target,
                    DependencyDataSource.Resolve(extractionLink.Parameters),
                    DependencyDataSource.Resolve(extractionLink.Headers),
                    _documentLink.Config,
                    _documentLink.Job,
                    _documentLink
                );
                break;

            case ExtractionLink.LinkTypes.Auto:
                yield return new AutoDetectLink(
                    target,
                    extractionLink.HttpMethod,
                    DependencyDataSource.Resolve(extractionLink.Parameters),
                    DependencyDataSource.Resolve(extractionLink.Headers),
                    extractionLink.ExtractLinks,
                    extractionLink.ExtractData,
                    _documentLink.Config,
                    _documentLink.Job,
                    scopedItems,
                    _documentLink
                );
                break;

            default:
                throw new NotSupportedException();
        }
    }
}