コード例 #1
0
        /// <summary>
        /// Creates a new <see cref="ExtractionLink"/> and fills it in by delegating to the
        /// three-argument <c>ReadExtractionLinkSection</c> overload.
        /// </summary>
        /// <param name="reader">XML reader handed unchanged to the populating overload.</param>
        /// <param name="config">Website configuration handed unchanged to the populating overload.</param>
        /// <returns>The newly created link after the populating overload has run on it.</returns>
        private static ExtractionLink ReadExtractionLinkSection(XmlReader reader, WebsiteConfig config)
        {
            ExtractionLink result = new ExtractionLink();
            ReadExtractionLinkSection(reader, config, result);
            return result;
        }
コード例 #2
0
ファイル: DocumentProcessor.cs プロジェクト: rezgar/crawler
        /// <summary>
        /// Produces one <see cref="ResourceLink"/> for every value stored in <c>ExtractedItems</c>
        /// under <paramref name="extractionLink"/>'s name.
        /// </summary>
        /// <remarks>
        /// This method is an iterator: its body (including the predefined-item extraction and any
        /// exception) only runs while the returned sequence is being enumerated.
        /// </remarks>
        /// <param name="extractionLink">
        /// Link definition supplying the name to look up, the link type, HTTP settings and the
        /// predefined extraction items evaluated for each link.
        /// </param>
        /// <returns>
        /// A lazy sequence of links; empty when <c>ExtractedItems</c> has no entry for the link's name.
        /// </returns>
        /// <exception cref="NotSupportedException">
        /// Raised during enumeration when <c>extractionLink.Type</c> is not Document, File or Auto.
        /// </exception>
        protected IEnumerable<ResourceLink> ExtractResourceLinks(ExtractionLink extractionLink)
        {
            if (!ExtractedItems.ContainsKey(extractionLink.Name))
            {
                yield break;
            }

            // Fetch the value list once rather than on every loop test and element access.
            // Count is still re-read each iteration, as in the original loop condition.
            var linkValues = ExtractedItems[extractionLink.Name];

            for (var i = 0; i < linkValues.Count; i++)
            {
                var linkValue = linkValues[i];
                var linkInDocumentPositionPointer = new ResponseParserPositionPointer(extractionLink.Location, i);

                // The scope is the same for every predefined item of this link, so decide it once:
                // either this link's own position, or no position at all.
                var predefinedItemsScope = extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink
                    ? linkInDocumentPositionPointer
                    : (ResponseParserPositionPointer?)null;

                // Each link gets its own item collection, which is handed to the created link below.
                var linkScopedExtractedItems = new CollectionDictionary<string, string>();
                foreach (var extractionItem in extractionLink.PredefinedExtractionItems.Values)
                {
                    ExtractItem(
                        extractionItem,
                        extractionLink.PredefinedExtractionItems,
                        linkScopedExtractedItems,
                        predefinedItemsScope
                        );
                }

                ResourceLink resourceLink;
                switch (extractionLink.Type)
                {
                case ExtractionLink.LinkTypes.Document:
                    resourceLink = new DocumentLink(
                        linkValue,
                        extractionLink.HttpMethod,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        _documentLink.Config,
                        _documentLink.Job,
                        linkScopedExtractedItems,
                        _documentLink
                        );
                    break;

                case ExtractionLink.LinkTypes.File:
                    // File links take no HTTP method, extraction flags or scoped items.
                    resourceLink = new FileLink(
                        linkValue,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        _documentLink.Config,
                        _documentLink.Job,
                        _documentLink
                        );
                    break;

                case ExtractionLink.LinkTypes.Auto:
                    resourceLink = new AutoDetectLink(
                        linkValue,
                        extractionLink.HttpMethod,
                        DependencyDataSource.Resolve(extractionLink.Parameters),
                        DependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        _documentLink.Config,
                        _documentLink.Job,
                        linkScopedExtractedItems,
                        _documentLink
                        );
                    break;

                default:
                    // Name the offending value so a misconfigured link type is diagnosable.
                    throw new NotSupportedException(
                        $"Link type '{extractionLink.Type}' is not supported for extraction link '{extractionLink.Name}'.");
                }

                yield return resourceLink;
            }
        }
コード例 #3
0
ファイル: DocumentProcessor.cs プロジェクト: rezgar/crawler
 /// <summary>
 /// Passes the sequence returned by <c>ExtractResourceLinks</c> for
 /// <paramref name="extractionLink"/> to <c>ExtractedLinks.AddValues</c>, keyed by the link's name.
 /// </summary>
 /// <param name="extractionLink">Link definition whose name is used as the key.</param>
 protected void ExtractLink(ExtractionLink extractionLink)
 {
     // NOTE(review): the sequence is lazy; when it is actually enumerated depends on AddValues.
     var resourceLinks = ExtractResourceLinks(extractionLink);
     ExtractedLinks.AddValues(extractionLink.Name, resourceLinks);
 }
コード例 #4
0
        /// <summary>
        /// Populates an existing <see cref="ExtractionLink"/> from the XML reader: first the shared
        /// extraction-item attributes, then the link-specific attributes, then the recognised
        /// child sections.
        /// </summary>
        /// <remarks>
        /// Attributes read: <c>extract_links</c>, <c>extract_data</c>, <c>method</c>, <c>type</c>
        /// and <c>depends_on</c>. Child sections handled: <c>predefined_items</c>,
        /// <c>parameters</c>, <c>headers</c> and <c>post_processors</c>; any other child name is ignored.
        /// NOTE(review): the second argument to <c>GetAttribute</c> is presumably the fallback used
        /// when the attribute is absent - confirm against the extension method.
        /// </remarks>
        /// <param name="reader">XML reader the attributes and children are read from.</param>
        /// <param name="config">Website configuration forwarded to the helper readers.</param>
        /// <param name="extractionLink">Link instance that receives the values.</param>
        private static void ReadExtractionLinkSection(XmlReader reader, WebsiteConfig config, ExtractionLink extractionLink)
        {
            ReadExtractionItemAttributes(extractionLink, reader, config);

            extractionLink.ExtractLinks = reader.GetAttribute("extract_links", extractionLink.ExtractLinks);
            extractionLink.ExtractData = reader.GetAttribute("extract_data", extractionLink.ExtractData);
            extractionLink.HttpMethod = reader.GetAttribute<string>("method", extractionLink.HttpMethod);
            extractionLink.Type = reader.GetAttribute("type", ExtractionLink.LinkTypes.Auto);

            // depends_on is a list separated by ',', ';' or '|'; empty entries are dropped.
            // A null attribute value leaves DependsOn null.
            var dependsOnAttribute = reader.GetAttribute<string>("depends_on", null);
            extractionLink.DependsOn = dependsOnAttribute?.Split(
                new[] { ',', ';', '|' },
                StringSplitOptions.RemoveEmptyEntries);

            reader.ProcessChildren((sectionName, sectionReader) =>
            {
                if (sectionName == "predefined_items")
                {
                    // Read the "relative" flag before the items section is read from the same reader.
                    extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink = sectionReader.GetAttribute(
                        "relative",
                        extractionLink.IsPredefinedExtractionItemsLocationRelativeToLink);
                    extractionLink.PredefinedExtractionItems = ReadExtractionItemsSection(sectionReader, config);
                }
                else if (sectionName == "parameters")
                {
                    extractionLink.Parameters = ReadExtractionLinkParametersSection(sectionReader);
                }
                else if (sectionName == "headers")
                {
                    extractionLink.Headers = ReadHttpHeadersSection(sectionReader);
                }
                else if (sectionName == "post_processors")
                {
                    ReadExtractionItemPostProcessors(extractionLink, sectionReader);
                }
            });
        }