Exemple #1
0
        /// <summary>
        /// Builds a <see cref="WebsiteJob"/> from the element the reader is currently on,
        /// reading its "name" attribute and its "initialization", "entry" and "dictionary" children.
        /// </summary>
        /// <param name="reader">Reader expected to be positioned on the job's start element.</param>
        /// <param name="config">Configuration the job is created for; also handed to the section readers.</param>
        /// <returns>The populated job.</returns>
        /// <exception cref="ArgumentException">
        /// A child start element other than "initialization", "entry" or "dictionary" was found.
        /// The element name is passed as the exception's parameter name.
        /// </exception>
        private static WebsiteJob ReadWebsiteJobNode(XmlTextReader reader, WebsiteConfig config)
        {
            // Captured before anything else touches the reader: a self-closing holder element
            // never produces an EndElement node, so without this the loop below would keep
            // reading past the job and consume whatever follows it.
            var isEmptyHolder = reader.IsEmptyElement;

            var job = new WebsiteJob(config);

            job.Name = reader.GetAttribute <string>("name", job.Name);
            var holderNodeName = reader.Name;

            // Stop once the reader sits on the holder's own end tag (or the input is exhausted).
            while (!isEmptyHolder
                   && !(reader.Name == holderNodeName && reader.NodeType == XmlNodeType.EndElement)
                   && reader.Read())
            {
                // Only start elements carry sections; whitespace, comments and end tags are skipped.
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "initialization":
                    job.InitializationDocumentLink = ReadInitializationDocumentSection(reader, config, job);
                    break;

                case "entry":
                    job.EntryLinks = ReadEntryLinksSection(reader, config, job);
                    break;

                case "dictionary":
                    job.PredefinedValues = ReadPredefinedValuesSection(reader, config);
                    break;

                default:
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(job);
        }
Exemple #2
0
 /// <summary>
 /// Creates a resource link whose URL, parameters and headers are given as
 /// <see cref="StringWithDependencies"/> values.
 /// </summary>
 /// <remarks>
 /// The chained constructor receives <c>null</c> for the URL, parameters and headers;
 /// the dependency-carrying values are stored in their own properties afterwards.
 /// </remarks>
 /// <param name="urlWithDependencies">Stored in <c>UrlWithDependencies</c>.</param>
 /// <param name="httpMethod">Forwarded to the chained constructor.</param>
 /// <param name="parametersWithDependencies">Stored in <c>ParametersWithDependencies</c>.</param>
 /// <param name="headersWithDependencies">Stored in <c>HeadersWithDependencies</c>.</param>
 /// <param name="config">Forwarded to the chained constructor.</param>
 /// <param name="job">Forwarded to the chained constructor.</param>
 /// <param name="referrerResourceLink">Forwarded to the chained constructor.</param>
 public ResourceLink(
     StringWithDependencies urlWithDependencies,
     string httpMethod,
     IDictionary<string, StringWithDependencies> parametersWithDependencies,
     IDictionary<string, StringWithDependencies> headersWithDependencies,
     WebsiteConfig config,
     WebsiteJob job,
     ResourceLink referrerResourceLink)
     : this(null, httpMethod, null, null, config, job, referrerResourceLink)
 {
     this.UrlWithDependencies = urlWithDependencies;
     this.ParametersWithDependencies = parametersWithDependencies;
     this.HeadersWithDependencies = headersWithDependencies;
 }
Exemple #3
0
 /// <summary>
 /// Creates an initialization link from dependency-carrying URL, parameters, headers
 /// and pre-extracted items.
 /// </summary>
 /// <remarks>
 /// Every argument is forwarded unchanged to the base constructor, followed by a
 /// trailing <c>null</c> argument.
 /// </remarks>
 /// <param name="urlWithDependencies">Forwarded to the base constructor.</param>
 /// <param name="httpMethod">Forwarded to the base constructor.</param>
 /// <param name="parametersWithDependencies">Forwarded to the base constructor.</param>
 /// <param name="headersWithDependencies">Forwarded to the base constructor.</param>
 /// <param name="extractLinks">Forwarded to the base constructor.</param>
 /// <param name="extractData">Forwarded to the base constructor.</param>
 /// <param name="config">Forwarded to the base constructor.</param>
 /// <param name="job">Forwarded to the base constructor.</param>
 /// <param name="preExtractedItemsWithDependencies">Forwarded to the base constructor; defaults to <c>null</c>.</param>
 public InitializationLink(
     StringWithDependencies urlWithDependencies,
     string httpMethod,
     IDictionary<string, StringWithDependencies> parametersWithDependencies,
     IDictionary<string, StringWithDependencies> headersWithDependencies,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary<string, StringWithDependencies> preExtractedItemsWithDependencies = null)
     : base(
         urlWithDependencies,
         httpMethod,
         parametersWithDependencies,
         headersWithDependencies,
         extractLinks,
         extractData,
         config,
         job,
         preExtractedItemsWithDependencies,
         null)
 {
 }
Exemple #4
0
 /// <summary>
 /// Creates an initialization link from plain-string URL, parameters, headers
 /// and pre-extracted items.
 /// </summary>
 /// <remarks>
 /// Every argument is forwarded unchanged to the base constructor, followed by a
 /// trailing <c>null</c> argument.
 /// </remarks>
 /// <param name="url">Forwarded to the base constructor.</param>
 /// <param name="httpMethod">Forwarded to the base constructor.</param>
 /// <param name="parameters">Forwarded to the base constructor.</param>
 /// <param name="headers">Forwarded to the base constructor.</param>
 /// <param name="extractLinks">Forwarded to the base constructor.</param>
 /// <param name="extractData">Forwarded to the base constructor.</param>
 /// <param name="config">Forwarded to the base constructor.</param>
 /// <param name="job">Forwarded to the base constructor.</param>
 /// <param name="preExtractedItems">Forwarded to the base constructor; defaults to <c>null</c>.</param>
 public InitializationLink(
     string url,
     string httpMethod,
     IDictionary<string, string> parameters,
     IDictionary<string, string> headers,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary<string, string> preExtractedItems = null)
     : base(
         url,
         httpMethod,
         parameters,
         headers,
         extractLinks,
         extractData,
         config,
         job,
         preExtractedItems,
         null)
 {
 }
Exemple #5
0
 /// <summary>
 /// Creates a document link from dependency-carrying URL, parameters, headers
 /// and pre-extracted items.
 /// </summary>
 /// <param name="urlWithDependencies">Forwarded to the base constructor.</param>
 /// <param name="httpMethod">Forwarded to the base constructor.</param>
 /// <param name="parametersWithDependencies">Forwarded to the base constructor.</param>
 /// <param name="headersWithDependencies">Forwarded to the base constructor.</param>
 /// <param name="extractLinks">Stored in <c>ExtractLinks</c>.</param>
 /// <param name="extractData">Stored in <c>ExtractData</c>.</param>
 /// <param name="config">Forwarded to the base constructor.</param>
 /// <param name="job">Forwarded to the base constructor.</param>
 /// <param name="preExtractedItemsWithDependencies">Stored in <c>PreExtractedItemsWithDependencies</c>.</param>
 /// <param name="referrerDocumentLink">Forwarded to the base constructor as the referrer; defaults to <c>null</c>.</param>
 public DocumentLink(
     StringWithDependencies urlWithDependencies,
     string httpMethod,
     IDictionary<string, StringWithDependencies> parametersWithDependencies,
     IDictionary<string, StringWithDependencies> headersWithDependencies,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary<string, StringWithDependencies> preExtractedItemsWithDependencies,
     DocumentLink referrerDocumentLink = null)
     : base(
         urlWithDependencies,
         httpMethod,
         parametersWithDependencies,
         headersWithDependencies,
         config,
         job,
         referrerDocumentLink)
 {
     this.ExtractLinks = extractLinks;
     this.ExtractData = extractData;
     this.PreExtractedItemsWithDependencies = preExtractedItemsWithDependencies;
 }
Exemple #6
0
 /// <summary>
 /// Creates a resource link from a plain-string URL, parameters and headers.
 /// </summary>
 /// <param name="url">Stored in <c>Url</c>.</param>
 /// <param name="httpMethod">Stored in <c>HttpMethod</c>.</param>
 /// <param name="parameters">Stored in <c>Parameters</c>.</param>
 /// <param name="headers">Stored in <c>Headers</c>.</param>
 /// <param name="config">
 /// Stored in <c>Config</c>. Its crawling settings supply a random user agent when the
 /// referrer does not provide one, so it is dereferenced in that case.
 /// </param>
 /// <param name="job">Stored in <c>Job</c>.</param>
 /// <param name="referrerResourceLink">Stored in <c>ReferrerResourceLink</c>; may be <c>null</c>.</param>
 public ResourceLink(
     string url,
     string httpMethod,
     IDictionary<string, string> parameters,
     IDictionary<string, string> headers,
     WebsiteConfig config,
     WebsiteJob job,
     ResourceLink referrerResourceLink)
 {
     this.Url = url;
     this.HttpMethod = httpMethod;
     this.Config = config;
     this.Job = job;
     this.Parameters = parameters;
     this.Headers = headers;
     this.ReferrerResourceLink = referrerResourceLink;

     // Reuse the referrer's user agent when there is one; otherwise pick a random configured one.
     var inheritedUserAgent = referrerResourceLink?.UserAgent;
     this.UserAgent = inheritedUserAgent ?? config.CrawlingSettings.UserAgents.GetRandomElement();
 }
Exemple #7
0
 /// <summary>
 /// Creates a document link from a plain-string URL, parameters, headers
 /// and pre-extracted items.
 /// </summary>
 /// <param name="url">Forwarded to the base constructor.</param>
 /// <param name="httpMethod">Forwarded to the base constructor.</param>
 /// <param name="parameters">Forwarded to the base constructor.</param>
 /// <param name="headers">Forwarded to the base constructor.</param>
 /// <param name="extractLinks">Stored in <c>ExtractLinks</c>.</param>
 /// <param name="extractData">Stored in <c>ExtractData</c>.</param>
 /// <param name="config">Forwarded to the base constructor.</param>
 /// <param name="job">Forwarded to the base constructor.</param>
 /// <param name="preExtractedItems">Stored in <c>PreExtractedItems</c>.</param>
 /// <param name="referrerDocumentLink">Forwarded to the base constructor as the referrer; defaults to <c>null</c>.</param>
 public DocumentLink(
     string url,
     string httpMethod,
     IDictionary<string, string> parameters,
     IDictionary<string, string> headers,
     bool extractLinks,
     bool extractData,
     WebsiteConfig config,
     WebsiteJob job,
     CollectionDictionary<string, string> preExtractedItems,
     DocumentLink referrerDocumentLink = null)
     : base(
         url,
         httpMethod,
         parameters,
         headers,
         config,
         job,
         referrerDocumentLink)
 {
     this.ExtractLinks = extractLinks;
     this.ExtractData = extractData;
     this.PreExtractedItems = preExtractedItems;
 }
Exemple #8
0
        /// <summary>
        /// Reads the "entry" section and creates one <c>AutoDetectLink</c> for every "link" child.
        /// </summary>
        /// <param name="reader">Reader expected to be positioned on the "entry" start element.</param>
        /// <param name="config">Configuration passed to each created link; its predefined values take part in dependency resolution.</param>
        /// <param name="job">Job passed to each created link; its predefined values take part in dependency resolution.</param>
        /// <returns>The created links in the order they were read; empty when the section has no "link" children.</returns>
        /// <exception cref="ArgumentException">
        /// A child start element other than "link" was found. The element name is passed
        /// as the exception's parameter name.
        /// </exception>
        private static IList <ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
        {
            var result = new List <ResourceLink>();

            // A self-closing <entry /> never produces an EndElement node, so the loop below
            // would otherwise read past the section and consume the elements that follow it.
            var isEmptySection = reader.NodeType == XmlNodeType.Element
                                 && reader.Name == "entry"
                                 && reader.IsEmptyElement;

            while (!isEmptySection
                   && !(reader.Name == "entry" && reader.NodeType == XmlNodeType.EndElement)
                   && reader.Read())
            {
                // Only start elements are of interest; everything else is skipped.
                if (!reader.IsStartElement())
                {
                    continue;
                }

                switch (reader.Name)
                {
                case "link":
                    var extractionLink     = ReadExtractionLinkSection(reader, config);
                    var linkExtractedItems = new CollectionDictionary <string, StringWithDependencies>();
                    foreach (var extractionItem in extractionLink.PredefinedExtractionItems.Values)
                    {
                        linkExtractedItems.AddValue(extractionItem.Name, extractionItem.Value);
                    }

                    // Only values that need no resolving are offered as a data source,
                    // keyed by item name and represented by their format strings.
                    var dependencyDataSource = new DependencyDataSource(
                        linkExtractedItems
                        .Where(pred => pred.Value.Any(value => !value.RequiresResolve))
                        .ToCollectionDictionary(
                            pred => pred.Key,
                            pred => pred.Value
                            .Where(value => !value.RequiresResolve)
                            .Select(sel => sel.FormatString)
                            ),
                        config.PredefinedValues,
                        job.PredefinedValues
                        );

                    // NOTE: These are entry links, so they can't have any location to extract items from, only constant values
                    // NOTE(review): job is dereferenced above, so the fallback to config below only
                    // applies when job is not a CrawlingBase — confirm that is the intent.
                    var extractedLink = new AutoDetectLink(
                        dependencyDataSource.Resolve(extractionLink.Value),
                        extractionLink.HttpMethod,
                        dependencyDataSource.Resolve(extractionLink.Parameters),
                        dependencyDataSource.Resolve(extractionLink.Headers),
                        extractionLink.ExtractLinks,
                        extractionLink.ExtractData,
                        config,
                        job,
                        dependencyDataSource.Resolve(linkExtractedItems),
                        (job as CrawlingBase ?? config)?.InitializationDocumentLink
                        );

                    result.Add(extractedLink);
                    break;

                default:
                    throw new ArgumentException("Unrecognized element", reader.Name);
                }
            }

            return(result);
        }
Exemple #9
0
        /// <summary>
        /// Reads the "initialization" section and builds the <see cref="InitializationLink"/> it describes.
        /// </summary>
        /// <remarks>
        /// The "url" and "method" attributes are read from the current element ("method" falls back
        /// to HTTP GET). The "parameters", "headers" and "extraction" children are recognized;
        /// any other child element is skipped without error.
        /// </remarks>
        /// <param name="reader">Reader expected to be positioned on the "initialization" start element.</param>
        /// <param name="config">Configuration passed to the created link and to the extraction-items reader.</param>
        /// <param name="job">Job passed to the created link.</param>
        /// <returns>
        /// A link created with link and data extraction both enabled, whose
        /// <c>ExtractionItemsOverride</c> holds the items read from the "extraction" child
        /// (an empty dictionary when there is none).
        /// </returns>
        private static InitializationLink ReadInitializationDocumentSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
        {
            // Captured before the attributes are read: a self-closing
            // <initialization url="..." /> never produces an EndElement node, so the loop
            // below would otherwise keep reading and swallow the rest of the document.
            var isEmptySection = reader.NodeType == XmlNodeType.Element
                                 && reader.Name == "initialization"
                                 && reader.IsEmptyElement;

            var url        = reader.GetAttribute("url");
            var httpMethod = reader.GetAttribute <string>("method", System.Net.WebRequestMethods.Http.Get);

            IDictionary <string, string>         parameters = null;
            IDictionary <string, string>         headers    = null;
            IDictionary <string, ExtractionItem> extractionItemsOverride = new Dictionary <string, ExtractionItem>();

            while (!isEmptySection
                   && !(reader.Name == "initialization" && reader.NodeType == XmlNodeType.EndElement)
                   && reader.Read())
            {
                // Only start elements are of interest; everything else is skipped.
                if (!reader.IsStartElement())
                {
                    continue;
                }

                // No default branch: unrecognized child elements are ignored here rather than rejected.
                switch (reader.Name)
                {
                case "parameters":
                    parameters = ReadExtractionLinkParametersSection(reader).ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                    break;

                case "headers":
                    headers = ReadHttpHeadersSection(reader).ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                    break;

                case "extraction":
                    extractionItemsOverride = ReadExtractionItemsSection(reader, config);
                    break;
                }
            }

            var result = new InitializationLink(
                url,
                httpMethod,
                parameters,
                headers,
                true,
                true,
                config,
                job
                );

            result.ExtractionItemsOverride = extractionItemsOverride;

            return(result);
        }