예제 #1
0
        /// <summary>
        /// Runs every extractor in <c>EntityExtractorList</c> against the page, records each non-empty
        /// result under the extractor's entity name, adds target requests, and flags the page as skipped
        /// when no result was recorded.
        /// </summary>
        /// <param name="page">The page to process; its result items and target requests are updated.</param>
        public void Process(Page page)
        {
            foreach (IEntityExtractor extractor in EntityExtractorList)
            {
                dynamic extracted = extractor.Process(page);

                // The extractor produced nothing at all.
                if (extracted == null)
                {
                    continue;
                }

                // A sequence without a first element is treated the same as "nothing extracted".
                if (extracted is IEnumerable && !((IEnumerable)extracted).GetEnumerator().MoveNext())
                {
                    continue;
                }

                page.AddResultItem(extractor.EntityName, extracted);
            }

            // Target URLs are only collected when the page has not opted out of them.
            bool collectTargets = !page.MissTargetUrls;
            if (collectTargets && GetCustomizeTargetUrls != null)
            {
                // A custom URL provider replaces the configured link extraction.
                page.AddTargetRequests(GetCustomizeTargetUrls(page));
            }
            else if (collectTargets)
            {
                ExtractLinks(page, TargetUrlExtractInfos);
            }

            bool nothingRecorded = page.ResultItems.Results.Count == 0;
            if (nothingRecorded)
            {
                page.ResultItems.IsSkip = true;
            }
        }
예제 #2
0
        /// <summary>
        /// Builds the page for <paramref name="request"/> from a locally stored file when one exists for
        /// the request URL, and falls back to <c>DownloadWhenMiss</c> otherwise.
        /// </summary>
        /// <param name="request">The request whose URL identifies the stored file.</param>
        /// <param name="task">The task whose <c>Identify</c> value selects the sub-directory.</param>
        /// <returns>The page read from the file, or the result of <c>DownloadWhenMiss</c> on a miss.</returns>
        public Page Download(Request request, ITask task)
        {
            string path = BasePath + "/" + task.Identify + "/";
            Page page = null;
            try
            {
                FileInfo file = GetFile(path + Encrypt.Md5Encrypt(request.Url));

                // Dispose the reader (and the underlying file stream) once the content has been read.
                using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
                {
                    // The first line must name the same URL, otherwise the file is not used.
                    string line = bufferedReader.ReadLine();
                    if (("url:\t" + request.Url).Equals(line))
                    {
                        string html = GetHtml(bufferedReader);
                        page = new Page(request);
                        page.SetUrl(PlainText.Create(request.Url));
                        page.SetHtml(Html.Create(html));
                    }
                }
            }
            catch (FileNotFoundException)
            {
                // A missing file is the normal "miss" case. The previous reflection-based type check
                // could never be true, so this message was never logged.
                _logger.Info("File not exist for url " + request.Url);
            }
            catch (IOException e)
            {
                _logger.Warn("File read error for url " + request.Url, e);
            }

            // Only fall back when no page was built from the file; previously the page read from the
            // file was unconditionally overwritten here.
            if (page == null)
            {
                page = DownloadWhenMiss(request, task);
            }

            return page;
        }
 /// <summary>
 /// Applies every rule in <c>_extractRules</c> to the page. A rule flagged <c>IsNotNull</c> that
 /// yields nothing marks the page as skipped; otherwise the extracted value is stored under the
 /// rule's field name.
 /// </summary>
 /// <param name="page">The page whose HTML is queried and whose result items are filled.</param>
 public void Process(Page page)
 {
     foreach (ExtractRule rule in _extractRules)
     {
         if (!rule.IsMulti)
         {
             // Single-value rule: a null selection counts as "missing".
             string single = page.GetHtml().SelectDocument(rule.Selector);
             bool requiredButMissing = rule.IsNotNull && single == null;
             if (requiredButMissing)
             {
                 page.SetSkip(true);
                 continue;
             }

             page.GetResultItems().Put(rule.FieldName, single);
             continue;
         }

         // Multi-value rule: an empty list counts as "missing".
         IList<string> many = page.GetHtml().SelectDocumentForList(rule.Selector);
         if (rule.IsNotNull && many.Count == 0)
         {
             page.SetSkip(true);
             continue;
         }

         page.GetResultItems().Put(rule.FieldName, many);
     }
 }
		/// <summary>
		/// Adds the blog links found on the page as target requests and stores the title, content,
		/// tags and "artical" fields selected by XPath.
		/// </summary>
		/// <param name="page">The page to read from and to write fields into.</param>
		public void Process(Page page)
		{
			// Follow every link that matches the blog-post URL pattern.
			IList<String> blogLinks = page.GetHtml().Links().Regex("http://my\\.oschina\\.net/flashsword/blog/\\d+").GetAll();
			page.AddTargetRequests(blogLinks);

			string title = page.GetHtml().XPath("//div[@class='BlogEntity']/div[@class='BlogTitle']/h1/text()").ToString();
			page.PutField("title", title);

			string content = page.GetHtml().XPath("//div[@class='BlogContent']/tidyText()").ToString();
			page.PutField("content", content);

			var tags = page.GetHtml().XPath("//div[@class='BlogTags']/a/text()").GetAll();
			page.PutField("tags", tags);

			string article = page.GetHtml().XPath("//*[@Class='Blog']/div[1]/div/h2/a").ToString();
			page.PutField("artical", article);
		}
예제 #5
0
        /// <summary>
        /// Creates a page whose content is the text of the local file addressed by the request URL.
        /// </summary>
        /// <param name="request">The request; <c>Url.LocalPath</c> is the file that is read.</param>
        /// <param name="spider">The spider whose site content type is passed to the page.</param>
        /// <returns>A page with status code 200 and both URL properties set to the request URL.</returns>
        public Page Download(Request request, ISpider spider)
        {
            return new Page(request, spider.Site.ContentType)
            {
                Content = File.ReadAllText(request.Url.LocalPath),
                TargetUrl = request.Url.ToString(),
                Url = request.Url.ToString(),
                StatusCode = 200
            };
        }
예제 #6
0
 /// <summary>
 /// For every page model extractor: extracts help and target links, runs the extractor and, when
 /// it yields a non-empty model, post-processes it and stores it under the model type's full name.
 /// Marks the result items as skipped when nothing was stored.
 /// </summary>
 /// <param name="page">The page to extract links and models from.</param>
 public void Process(Page page)
 {
     foreach (PageModelExtractor extractor in _pageModelExtractorList)
     {
         ExtractLinks(page, extractor.GetHelpUrlRegionSelector(), extractor.GetHelpUrlPatterns());
         ExtractLinks(page, extractor.GetTargetUrlRegionSelector(), extractor.GetTargetUrlPatterns());

         object model = extractor.Process(page);
         if (model == null)
         {
             continue;
         }

         // An empty list result is treated like no result.
         IList modelList = model as IList;
         if (modelList != null && modelList.Count == 0)
         {
             continue;
         }

         PostProcessPageModel(model);
         page.PutField(extractor.GetModelType().FullName, model);
     }

     bool nothingStored = page.GetResultItems().GetAll().Count == 0;
     if (nothingStored)
     {
         page.GetResultItems().IsSkip = true;
     }
 }
예제 #7
0
        /// <summary>
        /// Executes the configured script against the page. Only JavaScript is executed (through a
        /// Jurassic script engine); the Python and Ruby cases contain no executable code and do nothing.
        /// </summary>
        /// <param name="page">The page exposed to the script as the global value "page".</param>
        public void Process(Page page)
        {
            // Every language other than JavaScript is a no-op here.
            if (_language != Language.Javascript)
            {
                return;
            }

            // The definitions are prepended to the user script, separated by a line break.
            string source = _defines + Environment.NewLine + _script;

            var scriptEngine = new Jurassic.ScriptEngine { EnableExposedClrTypes = true };
            scriptEngine.SetGlobalValue("page", page);
            scriptEngine.SetGlobalValue("config", Site);
            scriptEngine.Execute(source);
        }
예제 #8
0
        /// <summary>
        /// Builds the page for <paramref name="request"/> from a locally stored file when one exists for
        /// the request URL, and falls back to <c>DownloadWhenMiss</c> otherwise.
        /// </summary>
        /// <param name="request">The request whose URL identifies the stored file.</param>
        /// <param name="spider">The spider providing the identity (sub-directory), site and logger.</param>
        /// <returns>The page read from the file, or the result of <c>DownloadWhenMiss</c> on a miss.</returns>
        public Page Download(Request request, ISpider spider)
        {
            string path = BasePath + "/" + spider.Identity + "/";
            Page page = null;
            try
            {
                FileInfo file = PrepareFile(path + Encrypt.Md5Encrypt(request.Url.ToString()));

                // Dispose the reader (and the underlying file stream) once the content has been read.
                using (StreamReader bufferedReader = new StreamReader(file.OpenRead()))
                {
                    // The first line must name the same URL, otherwise the file is not used.
                    string line = bufferedReader.ReadLine();
                    if (("url:\t" + request.Url).Equals(line))
                    {
                        string html = GetHtml(bufferedReader);
                        page = new Page(request, spider.Site.ContentType);
                        page.Url = request.Url.ToString();
                        page.Content = html;
                    }
                }
            }
            catch (FileNotFoundException)
            {
                // A typed catch replaces the per-platform reflection checks; the non-NET_CORE variant
                // could never be true, so this message was never logged there.
                spider.Logger.Info("File not exist for url: " + request.Url);
            }
            catch (IOException e)
            {
                spider.Logger.Warn("File read error for url " + request.Url, e);
            }

            // Only fall back when no page was built from the file; previously the page read from the
            // file was unconditionally overwritten here.
            if (page == null)
            {
                page = DownloadWhenMiss(request, spider);
            }

            return page;
        }
 /// <summary>
 /// Abstract check evaluated against a page; implementations return a boolean.
 /// NOTE(review): the name suggests <c>true</c> means crawling/processing may stop after this
 /// page — confirm against the implementations and callers.
 /// </summary>
 /// <param name="page">The page to evaluate.</param>
 /// <returns>The implementation's decision for the given page.</returns>
 public abstract bool CanStop(Page page);
예제 #10
0
 /// <summary>
 /// Intentionally empty: this implementation performs no work for the given page.
 /// NOTE(review): presumably a hook invoked after page processing — confirm with the caller.
 /// </summary>
 /// <param name="page">The page that was processed; unused here.</param>
 public void AfterProcess(Page page)
 {
 }
예제 #11
0
 /// <summary>
 /// Returns an HTML page with empty content for the request, without using the spider.
 /// </summary>
 /// <param name="request">The request the page is created for.</param>
 /// <param name="spider">Unused.</param>
 /// <returns>A new page whose content is the empty string.</returns>
 public Page Download(Request request, ISpider spider)
 {
     return new Page(request, ContentType.Html) { Content = "" };
 }
예제 #12
0
 /// <summary>
 /// Marks the page as skipped and does nothing else.
 /// </summary>
 /// <param name="page">The page whose <c>IsSkip</c> flag is set to true.</param>
 public void Process(Page page) => page.IsSkip = true;
예제 #13
0
 /// <summary>
 /// Abstract page handler; implementations process the page and return a <c>MatchOther</c> value.
 /// NOTE(review): the returned value presumably tells a dispatcher whether further processors
 /// should also be tried — confirm against the caller.
 /// </summary>
 /// <param name="page">The page to process.</param>
 /// <returns>A <c>MatchOther</c> value chosen by the implementation.</returns>
 public abstract MatchOther ProcessPage(Page page);
예제 #14
0
        /// <summary>
        /// Extracts every field of the entity from <paramref name="item"/> into a JSON object and, when
        /// the entity defines a stopping rule, sets <c>page.MissTargetUrls</c> if the rule's property
        /// is empty or the rule reports that crawling should stop.
        /// </summary>
        /// <param name="page">The page being processed; its <c>MissTargetUrls</c> flag may be set.</param>
        /// <param name="item">The selectable the field values are extracted from.</param>
        /// <param name="entityDefine">The entity definition with its fields and optional stopping rule.</param>
        /// <param name="index">The index passed through to field extraction.</param>
        /// <returns>The populated object, or null when no field produced a value.</returns>
        /// <exception cref="SpiderExceptoin">
        /// The stopping property is not a plain field, or its data type is an entity.
        /// </exception>
        private JObject ProcessSingle(Page page, ISelectable item, EntityMetadata entityDefine, int index)
        {
            JObject extracted = new JObject();

            foreach (var token in entityDefine.Entity.Fields)
            {
                var tokenValue = ExtractField(item, page, token, index);
                if (tokenValue != null)
                {
                    extracted.Add(token.Name, tokenValue);
                }
            }

            var stopRule = entityDefine.Stopping;

            if (stopRule != null)
            {
                var stopField = entityDefine.Entity.Fields.First(candidate => candidate.Name == stopRule.PropertyName) as Field;
                if (stopField == null)
                {
                    throw new SpiderExceptoin("Stopping cannot be EntityMetaData.");
                }

                var stopFieldType = stopField.DataType;
                if (VerifyIfEntity(stopFieldType))
                {
                    throw new SpiderExceptoin("Can't compare with object.");
                }

                stopRule.DataType = stopFieldType.ToString().ToLower();

                // A missing or empty value stops link collection just like a positive stop decision.
                string current = extracted.SelectToken($"$.{stopRule.PropertyName}")?.ToString();
                if (string.IsNullOrEmpty(current) || stopRule.NeedStop(current))
                {
                    page.MissTargetUrls = true;
                }
            }

            return extracted.Children().Any() ? extracted : null;
        }
예제 #15
0
        /// <summary>
        /// Adds target requests for the links inside the selected region. When no region selector is
        /// given no links are collected, because returning every link would yield too many URLs.
        /// </summary>
        /// <param name="page">The page to read links from and to add target requests to.</param>
        /// <param name="urlRegionSelector">Selector of the region to take links from; may be null.</param>
        /// <param name="urlPatterns">Patterns a link must match; null or empty accepts every link.</param>
        private void ExtractLinks(Page page, ISelector urlRegionSelector, IList<Regex> urlPatterns)
        {
            var candidates = urlRegionSelector == null ? new List<string>() : page.GetHtml().SelectList(urlRegionSelector).Links().GetAll();

            bool hasPatterns = urlPatterns != null && urlPatterns.Count != 0;
            if (!hasPatterns)
            {
                // No filtering requested: every collected link becomes a target request.
                page.AddTargetRequests(candidates);
                return;
            }

            foreach (Regex pattern in urlPatterns)
            {
                foreach (string candidate in candidates)
                {
                    if (!pattern.IsMatch(candidate))
                    {
                        continue;
                    }

                    page.AddTargetRequest(new Request(candidate, page.GetRequest().Extras));
                }
            }
        }
        /// <summary>
        /// Loads the request URL in a pooled web driver and returns the rendered page.
        /// </summary>
        /// <remarks>
        /// Runs the optional <c>Login</c> callback once, rebuilds the URL with an encoded query,
        /// navigates, waits <c>_webDriverWaitTime</c>, invokes <c>AfterNavigate</c>, validates the page
        /// and finally returns the driver to the pool.
        /// </remarks>
        /// <param name="request">The request to download.</param>
        /// <param name="spider">The spider whose site content type and validation settings are used.</param>
        /// <returns>The page built from the driver's page source, URL and title.</returns>
        /// <exception cref="SpiderExceptoin">The login callback reported failure.</exception>
        public override Page Download(Request request, ISpider spider)
        {
            WebDriverItem driverService = null;

            try
            {
                driverService = Pool.Get();

                // NOTE(review): lock (this) is kept because a dedicated lock object would need a new
                // field outside this method; consider a private readonly gate object.
                lock (this)
                {
                    if (!_isLogined && Login != null)
                    {
                        _isLogined = Login.Invoke(driverService.WebDriver as RemoteWebDriver);
                        if (!_isLogined)
                        {
                            throw new SpiderExceptoin("Login failed. Please check your login codes.");
                        }
                    }
                }

                // Rebuild the URL with an encoded query so URLs containing Chinese characters are not garbled.
                // IsDefaultPort is used instead of "port == 80" so the port is only omitted when it is
                // really the default for the scheme (e.g. https on port 80 keeps its explicit port).
                Uri uri = request.Url;
                string query = uri.Query;
                string realUrl = uri.Scheme + "://" + uri.DnsSafeHost + (uri.IsDefaultPort ? "" : (":" + uri.Port)) + uri.AbsolutePath + (string.IsNullOrEmpty(query)
                                    ? ""
                                    : ("?" + HttpUtility.UrlPathEncode(query.Substring(1))));

                if (UrlFormat != null)
                {
                    realUrl = UrlFormat(realUrl);
                }

                RedialManagerUtils.Execute("webdriverdownloader-download", () =>
                {
                    driverService.WebDriver.Navigate().GoToUrl(realUrl);
                });

                Thread.Sleep(_webDriverWaitTime);

                AfterNavigate?.Invoke((RemoteWebDriver)driverService.WebDriver);

                Page page = new Page(request, spider.Site.ContentType);
                page.Content = driverService.WebDriver.PageSource;
                page.Url = request.Url.ToString();
                page.TargetUrl = driverService.WebDriver.Url;
                page.Title = driverService.WebDriver.Title;

                ValidatePage(page, spider);

                // Must be cleared when finished: if this value were stored to Redis, a single task
                // would be run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                return page;
            }
            catch (Exception e)
            {
                // driverService is null when Pool.Get() itself failed; do not mask that failure
                // with a NullReferenceException.
                if (driverService != null && e.Message == "Need Verify Code.")
                {
                    VerifyCode?.Invoke(driverService.WebDriver as RemoteWebDriver);
                }

                // Rethrow without resetting the stack trace ("throw e;" discarded the original one).
                throw;
            }
            finally
            {
                // NOTE(review): driverService may still be null here if Pool.Get() threw — confirm
                // that ReturnToPool tolerates a null argument.
                Pool.ReturnToPool(driverService);
            }
        }
예제 #17
0
 /// <summary>
 /// Forwards the page to the wrapped <c>_pageProcessor</c>.
 /// </summary>
 /// <param name="page">The page to process.</param>
 public void Process(Page page) => _pageProcessor.Process(page);
예제 #18
0
        /// <summary>
        /// Extracts the value of a single field (or nested entity) from <paramref name="item"/>.
        /// </summary>
        /// <param name="item">The selectable the field's selector is applied to.</param>
        /// <param name="page">The current page, used to resolve environment values.</param>
        /// <param name="field">The field or entity definition to extract.</param>
        /// <param name="index">The index passed to environment value resolution.</param>
        /// <returns>
        /// Null when the field has no selector or a single nested entity selects nothing; a string for
        /// a single-valued field; a <c>JArray</c> for multi-valued fields and entities; a
        /// <c>JObject</c> for a single nested entity.
        /// </returns>
        private dynamic ExtractField(ISelectable item, Page page, DataToken field, int index)
        {
            ISelector selector = SelectorUtil.Parse(field.Selector);
            if (selector == null)
            {
                return null;
            }

            var f = field as Field;
            List<Formatter.Formatter> formatters = GenerateFormatter(f?.Formatters);

            bool isEntity = field is Entity;

            if (!isEntity)
            {
                string tmpValue;
                if (selector is EnviromentSelector)
                {
                    // Environment selectors are resolved from the page/request, not from the markup.
                    var enviromentSelector = selector as EnviromentSelector;
                    tmpValue = GetEnviromentValue(enviromentSelector.Field, page, index);
                    foreach (var formatter in formatters)
                    {
                        tmpValue = formatter.Formate(tmpValue);
                    }
                    return tmpValue;
                }

                bool needPlainText = (((Field)field).Option == PropertyExtractBy.ValueOption.PlainText);
                if (field.Multi)
                {
                    var propertyValues = item.SelectList(selector).Nodes();

                    List<string> results = new List<string>();
                    foreach (var propertyValue in propertyValues)
                    {
                        string tmp = propertyValue.GetValue(needPlainText);
                        foreach (var formatter in formatters)
                        {
                            tmp = formatter.Formate(tmp);
                        }
                        results.Add(tmp);
                    }
                    return new JArray(results);
                }

                bool needCount = (((Field)field).Option == PropertyExtractBy.ValueOption.Count);
                if (needCount)
                {
                    // Count option: report how many nodes matched, or "-1" when the node list is null.
                    var propertyValues = item.SelectList(selector).Nodes();
                    return propertyValues != null ? propertyValues.Count.ToString() : "-1";
                }

                tmpValue = item.Select(selector)?.GetValue(needPlainText);
                foreach (var formatter in formatters)
                {
                    tmpValue = formatter.Formate(tmpValue);
                }
                return tmpValue;
            }

            if (field.Multi)
            {
                // The node list is selected once; a duplicate, unused selection was removed.
                JArray objs = new JArray();
                var selectables = item.SelectList(selector).Nodes();
                foreach (var selectable in selectables)
                {
                    JObject obj = new JObject();

                    foreach (var child in ((Entity)field).Fields)
                    {
                        obj.Add(child.Name, ExtractField(selectable, page, child, 0));
                    }
                    objs.Add(obj);
                }
                return objs;
            }

            var single = item.Select(selector);
            if (single == null)
            {
                // Nothing matched the entity selector, so there is nothing to extract the children from.
                return null;
            }

            JObject singleObj = new JObject();
            foreach (var child in ((Entity)field).Fields)
            {
                // Recurse into the child definition. Passing the parent entity here (as before)
                // re-entered this same branch and never terminated.
                singleObj.Add(child.Name, ExtractField(single, page, child, 0));
            }
            return singleObj;
        }
예제 #19
0
        /// <summary>
        /// Extracts the configured entity from the page. Environment values are first resolved and
        /// stored as request extras; then either one object or a list of objects is produced,
        /// depending on whether the entity has a selector and is marked as multi.
        /// </summary>
        /// <param name="page">The page to extract from.</param>
        /// <returns>
        /// A list of <c>JObject</c> for a multi entity, a single <c>JObject</c> (possibly null)
        /// otherwise, or null when the entity selector matches nothing.
        /// </returns>
        public dynamic Process(Page page)
        {
            if (_enviromentValues != null)
            {
                foreach (var env in _enviromentValues)
                {
                    string key = env.Name;
                    var resolved = page.Selectable.Select(SelectorUtil.Parse(env.Selector)).GetValue();
                    page.Request.PutExtra(key, resolved);
                }
            }

            ISelector selector = SelectorUtil.Parse(_entityDefine.Selector);

            // Without a selector the entity is always treated as single-valued.
            bool isMulti = selector != null && _entityDefine.Multi;

            if (!isMulti)
            {
                ISelectable root;
                if (selector != null)
                {
                    root = page.Selectable.Select(selector);
                    if (root == null)
                    {
                        return null;
                    }
                }
                else
                {
                    // No selector: extract directly from the whole page.
                    root = page.Selectable;
                }

                return ProcessSingle(page, root, _entityDefine, 0);
            }

            var nodes = page.Selectable.SelectList(selector).Nodes();
            if (nodes == null || nodes.Count == 0)
            {
                return null;
            }

            var limit = _entityDefine.Limit;
            if (limit != null)
            {
                nodes = nodes.Take(limit.Value).ToList();
            }

            List<JObject> collected = new List<JObject>();
            int position = 0;
            foreach (var node in nodes)
            {
                JObject entity = ProcessSingle(page, node, _entityDefine, position);
                position++;
                if (entity != null)
                {
                    collected.Add(entity);
                }
            }

            return collected;
        }
예제 #20
0
 /// <summary>
 /// Hands the page to each matching sub page processor in order, stopping after the first one
 /// whose result is not <c>MatchOther.Yes</c>.
 /// </summary>
 /// <param name="page">The page to dispatch; its request is used for matching.</param>
 public void Process(Page page)
 {
     foreach (var candidate in _subPageProcessors)
     {
         if (!candidate.Match(page.GetRequest()))
         {
             continue;
         }

         // Anything other than Yes ends the dispatch; later processors are not consulted.
         if (candidate.ProcessPage(page) != MatchOther.Yes)
         {
             break;
         }
     }
 }
예제 #21
0
        /// <summary>
        /// Collects links for every target URL extractor and adds the matching ones to the page as
        /// target requests. An extractor whose link lookup returns null contributes nothing.
        /// </summary>
        /// <param name="page">The page to read links from and to add target requests to.</param>
        /// <param name="targetUrlExtractInfos">The extractors to apply; null means nothing is done.</param>
        private void ExtractLinks(Page page, List<Model.TargetUrlExtractor> targetUrlExtractInfos)
        {
            if (targetUrlExtractInfos == null)
            {
                return;
            }

            foreach (var targetUrlExtractInfo in targetUrlExtractInfos)
            {
                var urlRegionSelector = targetUrlExtractInfo.Region;
                var formatters = targetUrlExtractInfo.Formatters;
                var urlPatterns = targetUrlExtractInfo.Patterns;

                var links = urlRegionSelector == null ? page.Selectable.Links().GetValues() : (page.Selectable.SelectList(urlRegionSelector)).Links().GetValues();
                if (links == null)
                {
                    // Move on to the next extractor; returning here would skip all remaining ones.
                    continue;
                }

                // check: consider carefully whether formatting should run before or after matching;
                // the original author leans toward before. Formatting target URLs implies that the
                // start URL should conform to the same rule.
                if (formatters != null && formatters.Count > 0)
                {
                    List<string> tmp = new List<string>();
                    foreach (string link in links)
                    {
                        // Strings are immutable, so no defensive copy is needed before formatting.
                        string url = link;
                        foreach (Formatter f in formatters)
                        {
                            url = f.Formate(url);
                        }
                        tmp.Add(url);
                    }
                    links = tmp;
                }

                List<string> tmpLinks = new List<string>();
                foreach (var link in links)
                {
            #if !NET_CORE
                    tmpLinks.Add(HttpUtility.HtmlDecode(HttpUtility.UrlDecode(link)));
            #else
                    tmpLinks.Add(WebUtility.HtmlDecode(WebUtility.UrlDecode(link)));
            #endif
                }
                links = tmpLinks;

                if (urlPatterns == null || urlPatterns.Count == 0)
                {
                    // No patterns: accept every link of this extractor, then continue with the next one.
                    page.AddTargetRequests(links);
                    continue;
                }

                foreach (Regex targetUrlPattern in urlPatterns)
                {
                    foreach (string link in links)
                    {
                        if (targetUrlPattern.IsMatch(link))
                        {
                            page.AddTargetRequest(new Request(link, page.Request.NextDepth, page.Request.Extras));
                        }
                    }
                }
            }
        }
예제 #22
0
        /// <summary>
        /// Loads the request URL in a pooled web driver and returns the rendered page.
        /// </summary>
        /// <remarks>
        /// Site cookies are added to the driver before navigation. When the driver ends up on an
        /// error/login URL it is closed and not returned to the pool. Any exception results in a
        /// cycle retry (when the site allows it) or a call to <c>OnError</c>.
        /// </remarks>
        /// <param name="request">The request to download.</param>
        /// <param name="task">The task providing the site configuration.</param>
        /// <returns>
        /// The downloaded page, the result of <c>AddToCycleRetry</c> after a failure when retries are
        /// enabled, or null after <c>OnError</c> has been called.
        /// </returns>
        public override Page Download(Request request, ITask task)
        {
            CheckInit();
            Site site = task.Site;
            IWebDriver webDriver = null;
            try
            {
                webDriver = _webDriverPool.Get();

                IOptions manage = webDriver.Manage();
                if (site.GetCookies() != null)
                {
                    foreach (KeyValuePair<String, String> cookieEntry in site.GetCookies())
                    {
                        Cookie cookie = new Cookie(cookieEntry.Key, cookieEntry.Value);
                        manage.Cookies.AddCookie(cookie);
                    }
                }

                // Rebuild the URL with an encoded query so URLs containing Chinese characters are not garbled.
                // A non-default port is kept; it was previously dropped from the rebuilt URL.
                Uri uri = new Uri(request.Url);
                string query = uri.Query;
                string realUrl = uri.Scheme + "://" + uri.DnsSafeHost + (uri.IsDefaultPort ? "" : (":" + uri.Port)) + uri.AbsolutePath +
                                 (string.IsNullOrEmpty(query)
                                     ? ""
                                     : ("?" + HttpUtility.UrlPathEncode(query.Substring(1))));

                webDriver.Navigate().GoToUrl(realUrl);

                string resultUrl = webDriver.Url;
                if (resultUrl.Contains("error") || resultUrl.Contains("login") || resultUrl.Contains("//www.tmall.com"))
                {
                    Logger.Error("Url error: " + realUrl);
                    _webDriverPool.Close(webDriver);
                    // The driver has been closed: clear the reference so the finally block does not
                    // hand a closed driver back to the pool.
                    webDriver = null;
                    throw new SpiderExceptoin("Browser request too much.");
                }

                Thread.Sleep(_webDriverWaitTime);

                Page page = new Page(request);
                page.SetRawText(webDriver.PageSource);
                page.SetUrl(new PlainText(request.Url));

                // Must be cleared when finished: if this value were stored to Redis, a single task
                // would be run in an endless loop.
                request.PutExtra(Request.CycleTriedTimes, null);

                return page;
            }
            catch (Exception e)
            {
                if (site.CycleRetryTimes > 0)
                {
                    return AddToCycleRetry(request, site);
                }
                OnError(request, e);
                return null;
            }
            finally
            {
                // Null when no driver was obtained or when it was closed above.
                if (webDriver != null)
                {
                    _webDriverPool.ReturnToPool(webDriver);
                }
            }
        }
예제 #23
0
        /// <summary>
        /// Resolves an environment field name to its value. The built-in names "url", "targeturl",
        /// "now", "monday", "today" and "index" are matched case-insensitively; any other name is
        /// looked up in the request extras.
        /// </summary>
        /// <param name="field">The environment field name.</param>
        /// <param name="page">The page supplying URL values and request extras.</param>
        /// <param name="index">The value returned for the "index" field.</param>
        /// <returns>
        /// The resolved value; the field name itself when it is neither built-in nor a request
        /// extra; possibly null when the stored extra is null.
        /// </returns>
        private string GetEnviromentValue(string field, Page page, int index)
        {
            // Fold case once, culture-independently: a culture-sensitive ToLower() fails to match
            // e.g. "INDEX" under Turkish casing rules.
            switch (field.ToLowerInvariant())
            {
                case "url":
                    return page.Url;
                case "targeturl":
                    return page.TargetUrl;
                case "now":
                    return DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss");
                case "monday":
                    return DateTimeUtils.MONDAY_RUN_ID;
                case "today":
                    return DateTimeUtils.TODAY_RUN_ID;
                case "index":
                    return index.ToString();
            }

            // Not a built-in name: extras are looked up with the original (unfolded) field name.
            if (!page.Request.ExistExtra(field))
            {
                return field;
            }

            return page.Request.GetExtra(field)?.ToString();
        }