/// <summary>
/// Builds a <see cref="WebsiteJob"/> from the element the reader is currently positioned on.
/// </summary>
/// <remarks>
/// The job's name is read from the <c>name</c> attribute, with the job's current name passed
/// as the fallback value. Child elements <c>initialization</c>, <c>entry</c> and
/// <c>dictionary</c> are handed to their dedicated section readers; any other child element
/// is rejected.
/// </remarks>
/// <param name="reader">Reader positioned on the job's start element.</param>
/// <param name="config">Website configuration the job is created for.</param>
/// <returns>The populated job.</returns>
/// <exception cref="ArgumentException">
/// A child element other than <c>initialization</c>, <c>entry</c> or <c>dictionary</c> was found.
/// </exception>
private static WebsiteJob ReadWebsiteJobNode(XmlTextReader reader, WebsiteConfig config)
{
    // A self-closing holder (e.g. <job name="x"/>) never yields an EndElement node, so the
    // loop below would run past it into sibling elements. Remember the flag before reading on.
    var holderIsSelfClosing = reader.IsEmptyElement;

    var job = new WebsiteJob(config);
    job.Name = reader.GetAttribute<string>("name", job.Name);

    var holderNodeName = reader.Name;
    if (holderIsSelfClosing)
    {
        // Nothing but attributes to read; leave the reader on the element for the caller.
        return job;
    }

    // Read until the end tag that matches the holder element (or until the input ends).
    while (!(reader.Name == holderNodeName && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
    {
        if (!reader.IsStartElement())
        {
            continue;
        }

        switch (reader.Name)
        {
            case "initialization":
                job.InitializationDocumentLink = ReadInitializationDocumentSection(reader, config, job);
                break;

            case "entry":
                job.EntryLinks = ReadEntryLinksSection(reader, config, job);
                break;

            case "dictionary":
                job.PredefinedValues = ReadPredefinedValuesSection(reader, config);
                break;

            default:
                throw new ArgumentException("Unrecognized element", reader.Name);
        }
    }

    return job;
}
/// <summary>
/// Creates a resource link whose URL, parameters and headers are supplied as
/// <see cref="StringWithDependencies"/> values.
/// </summary>
/// <remarks>
/// Chains to a sibling constructor overload with <c>null</c> for the URL, parameters and
/// headers, then stores the dependency-carrying values on the
/// <c>*WithDependencies</c> properties.
/// </remarks>
/// <param name="urlWithDependencies">URL stored in <c>UrlWithDependencies</c>.</param>
/// <param name="httpMethod">HTTP method forwarded to the chained constructor.</param>
/// <param name="parametersWithDependencies">Parameters stored in <c>ParametersWithDependencies</c>.</param>
/// <param name="headersWithDependencies">Headers stored in <c>HeadersWithDependencies</c>.</param>
/// <param name="config">Website configuration forwarded to the chained constructor.</param>
/// <param name="job">Job forwarded to the chained constructor.</param>
/// <param name="referrerResourceLink">Referring link forwarded to the chained constructor.</param>
public ResourceLink(
    StringWithDependencies urlWithDependencies,
    string httpMethod,
    IDictionary<string, StringWithDependencies> parametersWithDependencies,
    IDictionary<string, StringWithDependencies> headersWithDependencies,
    WebsiteConfig config,
    WebsiteJob job,
    ResourceLink referrerResourceLink
)
    : this(null, httpMethod, null, null, config, job, referrerResourceLink)
{
    UrlWithDependencies = urlWithDependencies;
    ParametersWithDependencies = parametersWithDependencies;
    HeadersWithDependencies = headersWithDependencies;
}
/// <summary>
/// Creates an initialization link from <see cref="StringWithDependencies"/> values.
/// </summary>
/// <remarks>
/// All arguments are forwarded unchanged to the base constructor, with <c>null</c> passed
/// as its final argument.
/// </remarks>
/// <param name="urlWithDependencies">URL forwarded to the base constructor.</param>
/// <param name="httpMethod">HTTP method forwarded to the base constructor.</param>
/// <param name="parametersWithDependencies">Parameters forwarded to the base constructor.</param>
/// <param name="headersWithDependencies">Headers forwarded to the base constructor.</param>
/// <param name="extractLinks">Link-extraction flag forwarded to the base constructor.</param>
/// <param name="extractData">Data-extraction flag forwarded to the base constructor.</param>
/// <param name="config">Website configuration forwarded to the base constructor.</param>
/// <param name="job">Job forwarded to the base constructor.</param>
/// <param name="preExtractedItemsWithDependencies">
/// Optional pre-extracted items forwarded to the base constructor; defaults to <c>null</c>.
/// </param>
public InitializationLink(
    StringWithDependencies urlWithDependencies,
    string httpMethod,
    IDictionary<string, StringWithDependencies> parametersWithDependencies,
    IDictionary<string, StringWithDependencies> headersWithDependencies,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, StringWithDependencies> preExtractedItemsWithDependencies = null
)
    : base(
        urlWithDependencies,
        httpMethod,
        parametersWithDependencies,
        headersWithDependencies,
        extractLinks,
        extractData,
        config,
        job,
        preExtractedItemsWithDependencies,
        null)
{
}
/// <summary>
/// Creates an initialization link from plain-string URL, parameters and headers.
/// </summary>
/// <remarks>
/// All arguments are forwarded unchanged to the base constructor, with <c>null</c> passed
/// as its final argument.
/// </remarks>
/// <param name="url">URL forwarded to the base constructor.</param>
/// <param name="httpMethod">HTTP method forwarded to the base constructor.</param>
/// <param name="parameters">Parameters forwarded to the base constructor.</param>
/// <param name="headers">Headers forwarded to the base constructor.</param>
/// <param name="extractLinks">Link-extraction flag forwarded to the base constructor.</param>
/// <param name="extractData">Data-extraction flag forwarded to the base constructor.</param>
/// <param name="config">Website configuration forwarded to the base constructor.</param>
/// <param name="job">Job forwarded to the base constructor.</param>
/// <param name="preExtractedItems">
/// Optional pre-extracted items forwarded to the base constructor; defaults to <c>null</c>.
/// </param>
public InitializationLink(
    string url,
    string httpMethod,
    IDictionary<string, string> parameters,
    IDictionary<string, string> headers,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, string> preExtractedItems = null
)
    : base(
        url,
        httpMethod,
        parameters,
        headers,
        extractLinks,
        extractData,
        config,
        job,
        preExtractedItems,
        null)
{
}
/// <summary>
/// Creates a document link from <see cref="StringWithDependencies"/> values.
/// </summary>
/// <remarks>
/// The URL, method, parameters, headers, configuration, job and referrer go to the base
/// constructor; the extraction flags and pre-extracted items are stored on this instance.
/// </remarks>
/// <param name="urlWithDependencies">URL forwarded to the base constructor.</param>
/// <param name="httpMethod">HTTP method forwarded to the base constructor.</param>
/// <param name="parametersWithDependencies">Parameters forwarded to the base constructor.</param>
/// <param name="headersWithDependencies">Headers forwarded to the base constructor.</param>
/// <param name="extractLinks">Value stored in <c>ExtractLinks</c>.</param>
/// <param name="extractData">Value stored in <c>ExtractData</c>.</param>
/// <param name="config">Website configuration forwarded to the base constructor.</param>
/// <param name="job">Job forwarded to the base constructor.</param>
/// <param name="preExtractedItemsWithDependencies">
/// Value stored in <c>PreExtractedItemsWithDependencies</c>.
/// </param>
/// <param name="referrerDocumentLink">
/// Optional referring document link, forwarded to the base constructor; defaults to <c>null</c>.
/// </param>
public DocumentLink(
    StringWithDependencies urlWithDependencies,
    string httpMethod,
    IDictionary<string, StringWithDependencies> parametersWithDependencies,
    IDictionary<string, StringWithDependencies> headersWithDependencies,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, StringWithDependencies> preExtractedItemsWithDependencies,
    DocumentLink referrerDocumentLink = null
)
    : base(
        urlWithDependencies,
        httpMethod,
        parametersWithDependencies,
        headersWithDependencies,
        config,
        job,
        referrerDocumentLink)
{
    ExtractLinks = extractLinks;
    ExtractData = extractData;
    PreExtractedItemsWithDependencies = preExtractedItemsWithDependencies;
}
/// <summary>
/// Creates a resource link from plain-string URL, parameters and headers.
/// </summary>
/// <remarks>
/// The user agent is taken from <paramref name="referrerResourceLink"/> when that link
/// exists and has one; otherwise it is obtained by calling <c>GetRandomElement()</c> on
/// <c>config.CrawlingSettings.UserAgents</c>. In that fallback case
/// <paramref name="config"/> is dereferenced and therefore must not be <c>null</c>.
/// </remarks>
/// <param name="url">Value stored in <c>Url</c>.</param>
/// <param name="httpMethod">Value stored in <c>HttpMethod</c>.</param>
/// <param name="parameters">Value stored in <c>Parameters</c>.</param>
/// <param name="headers">Value stored in <c>Headers</c>.</param>
/// <param name="config">Value stored in <c>Config</c>; also the source of fallback user agents.</param>
/// <param name="job">Value stored in <c>Job</c>.</param>
/// <param name="referrerResourceLink">Value stored in <c>ReferrerResourceLink</c>; may be <c>null</c>.</param>
public ResourceLink(
    string url,
    string httpMethod,
    IDictionary<string, string> parameters,
    IDictionary<string, string> headers,
    WebsiteConfig config,
    WebsiteJob job,
    ResourceLink referrerResourceLink
)
{
    Url = url;
    HttpMethod = httpMethod;
    Config = config;
    Job = job;
    Parameters = parameters;
    Headers = headers;
    ReferrerResourceLink = referrerResourceLink;

    // Reuse the referrer's user agent when there is one; the config is only consulted otherwise.
    var inheritedUserAgent = referrerResourceLink?.UserAgent;
    UserAgent = inheritedUserAgent ?? config.CrawlingSettings.UserAgents.GetRandomElement();
}
/// <summary>
/// Creates a document link from plain-string URL, parameters and headers.
/// </summary>
/// <remarks>
/// The URL, method, parameters, headers, configuration, job and referrer go to the base
/// constructor; the extraction flags and pre-extracted items are stored on this instance.
/// </remarks>
/// <param name="url">URL forwarded to the base constructor.</param>
/// <param name="httpMethod">HTTP method forwarded to the base constructor.</param>
/// <param name="parameters">Parameters forwarded to the base constructor.</param>
/// <param name="headers">Headers forwarded to the base constructor.</param>
/// <param name="extractLinks">Value stored in <c>ExtractLinks</c>.</param>
/// <param name="extractData">Value stored in <c>ExtractData</c>.</param>
/// <param name="config">Website configuration forwarded to the base constructor.</param>
/// <param name="job">Job forwarded to the base constructor.</param>
/// <param name="preExtractedItems">Value stored in <c>PreExtractedItems</c>.</param>
/// <param name="referrerDocumentLink">
/// Optional referring document link, forwarded to the base constructor; defaults to <c>null</c>.
/// </param>
public DocumentLink(
    string url,
    string httpMethod,
    IDictionary<string, string> parameters,
    IDictionary<string, string> headers,
    bool extractLinks,
    bool extractData,
    WebsiteConfig config,
    WebsiteJob job,
    CollectionDictionary<string, string> preExtractedItems,
    DocumentLink referrerDocumentLink = null
)
    : base(url, httpMethod, parameters, headers, config, job, referrerDocumentLink)
{
    ExtractLinks = extractLinks;
    ExtractData = extractData;
    PreExtractedItems = preExtractedItems;
}
/// <summary>
/// Reads the <c>entry</c> section and turns each <c>link</c> child into a resolved
/// <see cref="AutoDetectLink"/>.
/// </summary>
/// <remarks>
/// Each link's URL, parameters, headers and predefined items are resolved through a
/// <see cref="DependencyDataSource"/> built from the link's own predefined extraction items
/// that do not require resolving, plus the predefined values of the config and of the job.
/// </remarks>
/// <param name="reader">Reader positioned on the <c>entry</c> start element.</param>
/// <param name="config">Website configuration; supplies predefined values.</param>
/// <param name="job">Job the entry links belong to; supplies predefined values.</param>
/// <returns>The entry links in document order; empty when the section has no links.</returns>
/// <exception cref="ArgumentException">A child element other than <c>link</c> was found.</exception>
private static IList<ResourceLink> ReadEntryLinksSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
{
    var result = new List<ResourceLink>();

    // A self-closing <entry/> never yields an </entry> node, so the loop below would run past
    // it into sibling elements. Treat it as an empty section instead.
    if (reader.NodeType == XmlNodeType.Element && reader.Name == "entry" && reader.IsEmptyElement)
    {
        return result;
    }

    while (!(reader.Name == "entry" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
    {
        if (!reader.IsStartElement())
        {
            continue;
        }

        switch (reader.Name)
        {
            case "link":
                var extractionLink = ReadExtractionLinkSection(reader, config);

                // Gather the link's predefined items, grouped by item name.
                var linkExtractedItems = new CollectionDictionary<string, StringWithDependencies>();
                foreach (var extractionItem in extractionLink.PredefinedExtractionItems.Values)
                {
                    linkExtractedItems.AddValue(extractionItem.Name, extractionItem.Value);
                }

                // Only values that need no resolving themselves are offered as resolution input,
                // alongside the config-level and job-level predefined values.
                var dependencyDataSource = new DependencyDataSource(
                    linkExtractedItems
                        .Where(pred => pred.Value.Any(value => !value.RequiresResolve))
                        .ToCollectionDictionary(
                            pred => pred.Key,
                            pred => pred.Value
                                .Where(value => !value.RequiresResolve)
                                .Select(sel => sel.FormatString)
                        ),
                    config.PredefinedValues,
                    job.PredefinedValues
                );

                // NOTE: These are entry links, so they can't have any location to extract items from, only constant values
                var extractedLink = new AutoDetectLink(
                    dependencyDataSource.Resolve(extractionLink.Value),
                    extractionLink.HttpMethod,
                    dependencyDataSource.Resolve(extractionLink.Parameters),
                    dependencyDataSource.Resolve(extractionLink.Headers),
                    extractionLink.ExtractLinks,
                    extractionLink.ExtractData,
                    config,
                    job,
                    dependencyDataSource.Resolve(linkExtractedItems),
                    // The job's initialization link when the job is a CrawlingBase, otherwise the config's.
                    (job as CrawlingBase ?? config)?.InitializationDocumentLink
                );

                result.Add(extractedLink);
                break;

            default:
                throw new ArgumentException("Unrecognized element", reader.Name);
        }
    }

    return result;
}
/// <summary>
/// Reads the <c>initialization</c> section and builds the corresponding
/// <see cref="InitializationLink"/>.
/// </summary>
/// <remarks>
/// The <c>url</c> and <c>method</c> attributes are read from the element itself, with
/// <see cref="System.Net.WebRequestMethods.Http.Get"/> passed as the fallback method. The child
/// elements <c>parameters</c>, <c>headers</c> and <c>extraction</c> are delegated to their
/// section readers; any other child element is ignored. The link is always created with
/// both extraction flags set to <c>true</c>.
/// </remarks>
/// <param name="reader">Reader positioned on the <c>initialization</c> start element.</param>
/// <param name="config">Website configuration passed to the link and to the extraction reader.</param>
/// <param name="job">Job the initialization link belongs to.</param>
/// <returns>
/// The initialization link, with <c>ExtractionItemsOverride</c> set to the items read from
/// the <c>extraction</c> child, or to an empty dictionary when there is none.
/// </returns>
private static InitializationLink ReadInitializationDocumentSection(XmlReader reader, WebsiteConfig config, WebsiteJob job)
{
    // A self-closing <initialization .../> never yields an end tag. Without this check the loop
    // below would silently skip every following sibling until the end of the document.
    var isSelfClosing =
        reader.NodeType == XmlNodeType.Element
        && reader.Name == "initialization"
        && reader.IsEmptyElement;

    var url = reader.GetAttribute("url");
    var httpMethod = reader.GetAttribute<string>("method", System.Net.WebRequestMethods.Http.Get);

    IDictionary<string, string> parameters = null;
    IDictionary<string, string> headers = null;
    IDictionary<string, ExtractionItem> extractionItemsOverride = new Dictionary<string, ExtractionItem>();

    if (!isSelfClosing)
    {
        while (!(reader.Name == "initialization" && reader.NodeType == XmlNodeType.EndElement) && reader.Read())
        {
            if (!reader.IsStartElement())
            {
                continue;
            }

            // Child elements not listed here are deliberately left unhandled (no default case).
            switch (reader.Name)
            {
                case "parameters":
                    parameters = ReadExtractionLinkParametersSection(reader)
                        .ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                    break;

                case "headers":
                    headers = ReadHttpHeadersSection(reader)
                        .ToDictionary(pred => pred.Key, pred => pred.Value.FormatString);
                    break;

                case "extraction":
                    extractionItemsOverride = ReadExtractionItemsSection(reader, config);
                    break;
            }
        }
    }

    var result = new InitializationLink(
        url,
        httpMethod,
        parameters,
        headers,
        true,
        true,
        config,
        job
    );
    result.ExtractionItemsOverride = extractionItemsOverride;

    return result;
}