Esempio n. 1
0
        /// <summary>
        /// Reads the elements of an "entry" section and converts every "link" element found there
        /// into an <see cref="AutoDetectLink"/>.
        /// </summary>
        /// <param name="reader">
        /// XML reader to consume. Reading stops as soon as the reader sits on an end element named
        /// "entry", or when <c>reader.Read()</c> reports that there is nothing left to read.
        /// </param>
        /// <param name="config">Website configuration; its predefined values take part in resolving the link, and it is passed on to every created link.</param>
        /// <param name="job">Website job; its predefined values take part in resolving the link, and it is passed on to every created link.</param>
        /// <returns>The created links, in the order their elements were read.</returns>
        /// <exception cref="ArgumentException">
        /// A start element other than "link" was encountered. The element name is supplied as the
        /// exception's parameter-name argument.
        /// </exception>
        private static IList <ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
        {
            var entryLinks = new List <ResourceLink>();

            while (true)
            {
                // Stop on </entry> without advancing; otherwise advance and stop when the input is exhausted.
                var atEndOfEntry = reader.Name == "entry" && reader.NodeType == XmlNodeType.EndElement;
                if (atEndOfEntry || !reader.Read())
                {
                    break;
                }

                // Only start elements are of interest here.
                if (!reader.IsStartElement())
                {
                    continue;
                }

                if (reader.Name != "link")
                {
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }

                var linkDefinition = ReadExtractionLinkSection(reader, config);

                // Gather the predefined items declared on the link, keyed by item name.
                var predefinedItems = new CollectionDictionary <string, StringWithDependencies>();
                foreach (var predefinedItem in linkDefinition.PredefinedExtractionItems.Values)
                {
                    predefinedItems.AddValue(predefinedItem.Name, predefinedItem.Value);
                }

                // Only values that do not require resolving themselves are offered to the data source,
                // represented by their format strings.
                var itemsWithReadyValues = predefinedItems
                    .Where(entry => entry.Value.Any(candidate => !candidate.RequiresResolve));
                var readyValues = itemsWithReadyValues
                    .ToCollectionDictionary(
                        entry => entry.Key,
                        entry => entry.Value
                        .Where(candidate => !candidate.RequiresResolve)
                        .Select(candidate => candidate.FormatString)
                        );

                var resolver = new DependencyDataSource(readyValues, config.PredefinedValues, job.PredefinedValues);

                // NOTE: Entry links have no location to extract items from, so everything is resolved from constant values only
                var resolvedUrl = resolver.Resolve(linkDefinition.Value);
                var entryLink = new AutoDetectLink(
                    resolvedUrl,
                    linkDefinition.HttpMethod,
                    resolver.Resolve(linkDefinition.Parameters),
                    resolver.Resolve(linkDefinition.Headers),
                    linkDefinition.ExtractLinks,
                    linkDefinition.ExtractData,
                    config,
                    job,
                    resolver.Resolve(predefinedItems),
                    // Take the link from the job when "job as CrawlingBase" is non-null, otherwise from the config.
                    (job as CrawlingBase ?? config)?.InitializationDocumentLink
                    );

                entryLinks.Add(entryLink);
            }

            return entryLinks;
        }
Esempio n. 2
0
        /// <summary>
        /// Lazily produces one <see cref="ResourceLink"/> for each value stored in
        /// <c>ExtractedItems</c> under the name of <paramref name="extractionLink"/>.
        /// </summary>
        /// <param name="extractionLink">
        /// Link definition; its <c>Type</c> selects whether a <see cref="DocumentLink"/>,
        /// <see cref="FileLink"/> or <see cref="AutoDetectLink"/> is created.
        /// </param>
        /// <returns>
        /// The created links, in the order of the extracted values. Nothing is yielded when
        /// <c>ExtractedItems</c> has no entry for the link's name.
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// The link type is none of Document, File or Auto. Because this is an iterator, the
        /// exception surfaces during enumeration.
        /// </exception>
        protected IEnumerable <ResourceLink> ExtractResourceLinks(ExtractionLink extractionLink)
        {
            if (!ExtractedItems.ContainsKey(extractionLink.Name))
            {
                yield break;
            }

            // The extracted values are looked up again on every pass, as enumeration is lazy.
            for (var index = 0; index < ExtractedItems[extractionLink.Name].Count; index++)
            {
                var target = ExtractedItems[extractionLink.Name][index];
                var positionInDocument = new ResponseParserPositionPointer(extractionLink.Location, index);

                // Extract the link's predefined items into a collection scoped to this one link.
                var scopedItems = new CollectionDictionary <string, string>();
                foreach (var predefinedItem in extractionLink.PredefinedExtractionItems.Values)
                {
                    ExtractItem(
                        predefinedItem,
                        extractionLink.PredefinedExtractionItems,
                        scopedItems,

                        // The position pointer is only passed when item locations are relative to the link.
                        !extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink
                            ? (ResponseParserPositionPointer?)null
                            : positionInDocument
                        );
                }

                ResourceLink produced;
                var linkType = extractionLink.Type;
                if (linkType == ExtractionLink.LinkTypes.Document)
                {
                    produced = new DocumentLink(
                        target,
                        extractionLink.HttpMethod,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        _documentLink.Config,
                        _documentLink.Job,
                        scopedItems,
                        _documentLink
                        );
                }
                else if (linkType == ExtractionLink.LinkTypes.File)
                {
                    // File links take neither an HTTP method, extraction settings nor scoped items.
                    produced = new FileLink(
                        target,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        _documentLink.Config,
                        _documentLink.Job,
                        _documentLink
                        );
                }
                else if (linkType == ExtractionLink.LinkTypes.Auto)
                {
                    produced = new AutoDetectLink(
                        target,
                        extractionLink.HttpMethod,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        _documentLink.Config,
                        _documentLink.Job,
                        scopedItems,
                        _documentLink
                        );
                }
                else
                {
                    throw new NotSupportedException();
                }

                yield return produced;
            }
        }