/// <summary>
/// Creates a crawler source node whose pending-URL queue is seeded with the given root URLs.
/// </summary>
/// <param name="rootUrls">Starting URLs; each one is enqueued and also added to <c>VisitedUrls</c>.</param>
/// <param name="entityModel">Entity model stored in <c>EntityModel</c>.</param>
/// <param name="discoverCrawlerUrls">
/// Callback stored in <c>DiscoverCrawlerUrls</c>. It takes two strings and returns a sequence of strings
/// (presumably page content and page URL in, follow-up URLs out; confirm against the call sites).
/// </param>
/// <param name="cookieContainer">Optional cookie container, stored as-is; may be <c>null</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="rootUrls"/> is <c>null</c>.</exception>
public WebCrawlerSourceNode(string[] rootUrls, XMLEntityModel entityModel, Func <string, string, IEnumerable <string> > discoverCrawlerUrls, CookieContainer cookieContainer = null)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // raised from the foreach below after the queue has already been created.
            if (rootUrls == null)
            {
                throw new ArgumentNullException(nameof(rootUrls));
            }

            _NextUrlQueue = new ConcurrentQueue <string>();
            foreach (string rootUrl in rootUrls)
            {
                // Root URLs are queued for processing and recorded in VisitedUrls in the same step.
                _NextUrlQueue.Enqueue(rootUrl);
                VisitedUrls.Add(rootUrl);
            }

            EntityModel         = entityModel;
            DiscoverCrawlerUrls = discoverCrawlerUrls;
            _CookieContainer    = cookieContainer;
        }
Beispiel #2
0
 /// <summary>
 /// Builds a pipeline task that starts from a <c>TextFileSourceNode</c> created for
 /// <paramref name="path"/> and then applies <c>ParseJson</c> on <c>Entity.DefaultColumn</c>.
 /// </summary>
 /// <param name="path">Value passed to the <c>TextFileSourceNode</c> constructor.</param>
 /// <param name="model">Entity model handed to the JSON parsing step.</param>
 /// <returns>The task returned by <c>ParseJson</c>.</returns>
 public static PipelineTask FromJsonFile(string path, XMLEntityModel model)
 {
     var fileSource = new TextFileSourceNode(path);
     var task = PipelineTask.Create(fileSource);

     return task.ParseJson(Entity.DefaultColumn, model);
 }
Beispiel #3
0
 /// <summary>
 /// Builds a pipeline task that starts from a <c>WebSourceNode</c> created for
 /// <paramref name="url"/> and then applies <c>ParseJson</c> on <c>Entity.DefaultColumn</c>.
 /// </summary>
 /// <param name="url">Value passed to the <c>WebSourceNode</c> constructor.</param>
 /// <param name="model">Entity model handed to the JSON parsing step.</param>
 /// <param name="container">Optional cookie container forwarded to the <c>WebSourceNode</c>; may be <c>null</c>.</param>
 /// <returns>The task returned by <c>ParseJson</c>.</returns>
 public static PipelineTask FromJsonWeb(string url, XMLEntityModel model, CookieContainer container = null)
 {
     var webSource = new WebSourceNode(url, container);
     var task = PipelineTask.Create(webSource);

     return task.ParseJson(Entity.DefaultColumn, model);
 }
Beispiel #4
0
 /// <summary>
 /// Convenience overload: parses HTML using <c>Entity.DefaultColumn</c> as the column.
 /// </summary>
 /// <param name="model">Entity model forwarded to the two-argument overload.</param>
 /// <returns>The result of <c>ParseHtml(Entity.DefaultColumn, model)</c>.</returns>
 public PipelineTask ParseHtml(XMLEntityModel model)
 {
     var defaultColumn = Entity.DefaultColumn;

     return ParseHtml(defaultColumn, model);
 }
Beispiel #5
0
 /// <summary>
 /// Registers a process node, via <c>AddProcessNode</c>, that is constructed as a
 /// <c>ParseHtmlProcessNode</c> from the supplied node, column and model.
 /// </summary>
 /// <param name="column">Column name passed to the <c>ParseHtmlProcessNode</c> constructor.</param>
 /// <param name="model">Entity model passed to the <c>ParseHtmlProcessNode</c> constructor.</param>
 /// <returns>Whatever <c>AddProcessNode</c> returns.</returns>
 public PipelineTask ParseHtml(string column, XMLEntityModel model)
 {
     return AddProcessNode(parentNode => new ParseHtmlProcessNode(parentNode, column, model));
 }
Beispiel #6
0
 /// <summary>
 /// Registers a process node, via <c>AddProcessNode</c>, that is constructed as a
 /// <c>ParseJsonProcessNode</c> from the supplied node, target column and model.
 /// </summary>
 /// <param name="targetColumn">Column name passed to the <c>ParseJsonProcessNode</c> constructor.</param>
 /// <param name="model">Entity model passed to the <c>ParseJsonProcessNode</c> constructor.</param>
 /// <returns>Whatever <c>AddProcessNode</c> returns.</returns>
 public PipelineTask ParseJson(string targetColumn, XMLEntityModel model)
 {
     return AddProcessNode(parentNode => new ParseJsonProcessNode(parentNode, targetColumn, model));
 }
Beispiel #7
0
 /// <summary>
 /// Creates an XML parsing process node attached to <paramref name="parent"/>.
 /// </summary>
 /// <param name="parent">Node forwarded to the base constructor.</param>
 /// <param name="targetColumn">Value stored in <c>TargetColumn</c>.</param>
 /// <param name="model">Value stored in <c>Model</c>.</param>
 public ParseXMLProcessNode(DataNode parent, string targetColumn, XMLEntityModel model)
     : base(parent)
 {
     // Keep the parsing configuration on the node.
     this.TargetColumn = targetColumn;
     this.Model = model;
 }
 /// <summary>
 /// Creates a JSON parsing process node; all arguments are forwarded unchanged to the base constructor.
 /// </summary>
 /// <param name="parent">Node forwarded to the base constructor.</param>
 /// <param name="targetColumn">Target column forwarded to the base constructor.</param>
 /// <param name="model">Entity model forwarded to the base constructor.</param>
 public ParseJsonProcessNode(DataNode parent, string targetColumn, XMLEntityModel model)
     : base(parent, targetColumn, model)
 {
     // No state of its own is initialised here.
 }
 /// <summary>
 /// Creates a crawler source node whose URL-discovery callback calls
 /// <c>GetUrlsFromHtmlByPatterns</c> with the supplied <paramref name="urlPatterns"/>.
 /// </summary>
 /// <param name="rootUrls">Root URLs forwarded to the delegate-taking constructor.</param>
 /// <param name="entityModel">Entity model forwarded to the delegate-taking constructor.</param>
 /// <param name="cookieContainer">Optional cookie container forwarded as-is; may be <c>null</c>.</param>
 /// <param name="urlPatterns">Patterns captured by the discovery callback and passed to <c>GetUrlsFromHtmlByPatterns</c>.</param>
 public WebCrawlerSourceNode(string[] rootUrls, XMLEntityModel entityModel, CookieContainer cookieContainer = null, params string[] urlPatterns)
     : this(
         rootUrls,
         entityModel,
         (pageContent, pageUrl) => GetUrlsFromHtmlByPatterns(pageContent, pageUrl, urlPatterns),
         cookieContainer)
 {
 }