예제 #1
0
        /// <summary>
        /// Stores every configured global value in the request extras and then
        /// extracts the entity objects described by <c>EntityMetadata</c> from the page.
        /// </summary>
        /// <param name="page">Page whose selectable content is queried.</param>
        /// <returns>
        /// The extracted objects, or <c>null</c> when the selector yields no nodes or
        /// the single extraction produces nothing.
        /// </returns>
        public virtual List<JObject> Extract(Page page)
        {
            bool hasGlobalValues = GlobalValues != null && GlobalValues.Count > 0;
            if (hasGlobalValues)
            {
                foreach (var globalValue in GlobalValues)
                {
                    string key = globalValue.Name;
                    var extracted = page.Selectable.Select(SelectorUtil.Parse(globalValue)).GetValue();
                    page.Request.PutExtra(key, extracted);
                }
            }

            ISelector entitySelector = SelectorUtil.Parse(EntityMetadata.Selector);

            // Single-entity mode: either no selector is configured or the entity is not "multi".
            if (entitySelector == null || !EntityMetadata.Multi)
            {
                ISelectable scope = entitySelector == null
                    ? page.Selectable
                    : page.Selectable.Select(entitySelector);
                if (scope == null)
                {
                    return null;
                }

                var single = ExtractSingle(page, scope, 0);
                if (single == null)
                {
                    return null;
                }

                return new List<JObject> { single };
            }

            // Multi-entity mode: one object per matched node.
            var nodes = page.Selectable.SelectList(entitySelector).Nodes();
            if (nodes == null || nodes.Count == 0)
            {
                return null;
            }

            if (EntityMetadata.Take > 0)
            {
                nodes = nodes.Take(EntityMetadata.Take).ToList();
            }

            var extractedObjects = new List<JObject>();
            int position = 0;
            foreach (var node in nodes)
            {
                var extractedObject = ExtractSingle(page, node, position);
                if (extractedObject != null)
                {
                    extractedObjects.Add(extractedObject);
                }
                position++;
            }

            return extractedObjects;
        }
예제 #2
0
        /// <summary>
        /// Parses the page into spider entity objects.
        /// </summary>
        /// <param name="page">Page data.</param>
        /// <returns>
        /// The spider entity objects, or <c>null</c> when the selector yields no nodes
        /// or the single extraction produces nothing.
        /// </returns>
        public List<T> Extract(Page page)
        {
            var sharedValues = EntityDefine.SharedValues;
            if (sharedValues != null && sharedValues.Count > 0)
            {
                // Shared values are read from the page and stored in the request extras.
                foreach (var sharedValue in EntityDefine.SharedValues)
                {
                    string key = sharedValue.Name;
                    var extracted = page.Selectable.Select(SelectorUtil.Parse(sharedValue)).GetValue();
                    page.Request.PutExtra(key, extracted);
                }
            }

            ISelector entitySelector = SelectorUtil.Parse(EntityDefine.Selector);

            // Single-entity mode: either no selector is configured or the entity is not "multi".
            if (entitySelector == null || !EntityDefine.Multi)
            {
                ISelectable scope = entitySelector == null
                    ? page.Selectable
                    : page.Selectable.Select(entitySelector);
                if (scope == null)
                {
                    return null;
                }

                var single = ExtractSingle(page, scope, 0);
                if (single == null)
                {
                    return null;
                }

                return new List<T> { single };
            }

            // Multi-entity mode: one entity per matched node.
            var nodes = page.Selectable.SelectList(entitySelector).Nodes();
            if (nodes == null || nodes.Count == 0)
            {
                return null;
            }

            if (EntityDefine.Take > 0)
            {
                nodes = nodes.Take(EntityDefine.Take).ToList();
            }

            var entities = new List<T>();
            for (int position = 0; position < nodes.Count; position++)
            {
                var entity = ExtractSingle(page, nodes[position], position);
                if (entity != null)
                {
                    entities.Add(entity);
                }
            }

            return entities;
        }
예제 #3
0
        /// <summary>
        /// Decides whether crawling should stop by comparing the times selected from
        /// the page (via <c>TimeSelector</c>) with the configured stop times in <c>Times</c>.
        /// </summary>
        /// <param name="page">Page the time values are selected from.</param>
        /// <param name="creator">Not used by this implementation.</param>
        /// <returns>
        /// <c>true</c> when no values could be selected, or when any page time is earlier
        /// (<c>IsBefore</c> set) or later (<c>IsBefore</c> not set) than any stop time;
        /// otherwise <c>false</c>.
        /// </returns>
        public bool NeedStop(Page page, BaseTargetUrlsCreator creator)
        {
            var rawTimes = page.Selectable.SelectList(SelectorUtil.Parse(TimeSelector)).GetValues();
            if (rawTimes == null)
            {
                return true;
            }

            // Pass 1: run every selected value through the formatter chain.
            var formattedTimes = new List<string>();
            foreach (var rawTime in rawTimes)
            {
                var text = rawTime;
                if (TimeFormatters != null)
                {
                    foreach (var formatter in TimeFormatters)
                    {
                        text = formatter.Formate(text);
                    }
                }
                formattedTimes.Add(text);
            }

            // Pass 2: compare each formatted time against every stop time.
            foreach (var formattedTime in formattedTimes)
            {
                DateTime pageTime = DateTime.Parse(formattedTime);
                bool stopWhenEarlier = IsBefore;

                foreach (var stopper in Times)
                {
                    DateTime stopTime = DateTime.Parse(stopper);
                    bool reached = stopWhenEarlier ? pageTime < stopTime : pageTime > stopTime;
                    if (reached)
                    {
                        return true;
                    }
                }
            }

            return false;
        }
예제 #4
0
        /// <summary>
        /// Decides whether crawling can stop by comparing the times selected from the
        /// page (via <c>CurrenctPageSelector</c>) with the configured <c>Stoppers</c>.
        /// </summary>
        /// <param name="page">Page the time values are selected from.</param>
        /// <returns>
        /// <c>true</c> when no values could be selected, or when any page time is earlier
        /// (<c>IsBefore</c> set) or later (<c>IsBefore</c> not set) than any stopper;
        /// otherwise <c>false</c>.
        /// </returns>
        public override bool CanStop(Page page)
        {
            var selectedValues = page.Selectable.SelectList(SelectorUtil.Parse(CurrenctPageSelector)).GetValues();
            if (selectedValues == null)
            {
                return true;
            }

            // Pass 1: run every selected value through the formatter chain.
            var formattedTimes = new List<string>();
            foreach (var selectedValue in selectedValues)
            {
                var text = selectedValue;
                if (CurrenctPageFormatters != null)
                {
                    foreach (var formatter in CurrenctPageFormatters)
                    {
                        text = formatter.Formate(text);
                    }
                }
                formattedTimes.Add(text);
            }

            // Pass 2: compare each formatted time against every stopper.
            foreach (var formattedTime in formattedTimes)
            {
                DateTime pageTime = DateTime.Parse(formattedTime.ToString());
                bool stopWhenEarlier = IsBefore;

                foreach (var stopper in Stoppers)
                {
                    DateTime stopTime = DateTime.Parse(stopper);
                    bool reached = stopWhenEarlier ? pageTime < stopTime : pageTime > stopTime;
                    if (reached)
                    {
                        return true;
                    }
                }
            }

            return false;
        }
예제 #5
0
        /// <summary>
        /// Builds a <c>Model.TargetUrlExtractor</c> from this definition: the formatters
        /// are passed through, the region is parsed into a selector and every
        /// non-blank pattern is turned into a <see cref="Regex"/>.
        /// </summary>
        /// <returns>The populated extractor model.</returns>
        internal Model.TargetUrlExtractor GetTargetUrlExtractInfo()
        {
            var extractor = new Model.TargetUrlExtractor();
            extractor.Formatters = Formatters;
            extractor.Region = SelectorUtil.Parse(Region);

            foreach (var pattern in Patterns)
            {
                // Null, empty and whitespace-only patterns are skipped.
                if (string.IsNullOrEmpty(pattern?.Trim()))
                {
                    continue;
                }

                extractor.Patterns.Add(new Regex(pattern));
            }

            return extractor;
        }
예제 #6
0
        /// <summary>
        /// Decides whether paging should stop by comparing the current page number with
        /// the total page count, both selected from the page.
        /// </summary>
        /// <param name="page">Page the two numbers are selected from.</param>
        /// <param name="creator">Not used by this implementation.</param>
        /// <returns>
        /// <c>true</c> when the current page number equals the total page count;
        /// otherwise <c>false</c>.
        /// </returns>
        /// <exception cref="FormatException">
        /// Propagated from <see cref="int.Parse(string)"/> when a non-empty value is not a number.
        /// </exception>
        public bool NeedStop(Page page, BaseTargetUrlsCreator creator)
        {
            // The two defaults differ so that two unavailable values never compare equal.
            int totalPage = -2000;

            if (TotalPageSelector != null)
            {
                // Select may yield null when nothing matches; treat that as "value not
                // available" instead of dereferencing it.
                var totalNode = page.Selectable.Select(SelectorUtil.Parse(TotalPageSelector));
                if (totalNode != null)
                {
                    string totalStr = totalNode.GetValue();
                    if (TotalPageFormatters != null)
                    {
                        foreach (var formatter in TotalPageFormatters)
                        {
                            totalStr = formatter.Formate(totalStr);
                        }
                    }
                    if (!string.IsNullOrEmpty(totalStr))
                    {
                        totalPage = int.Parse(totalStr);
                    }
                }
            }

            int currentPage = -1000;

            if (CurrenctPageSelector != null)
            {
                var currentNode = page.Selectable.Select(SelectorUtil.Parse(CurrenctPageSelector));
                if (currentNode != null)
                {
                    string currentStr = currentNode.GetValue();
                    if (CurrnetPageFormatters != null)
                    {
                        foreach (var formatter in CurrnetPageFormatters)
                        {
                            currentStr = formatter.Formate(currentStr);
                        }
                    }
                    if (!string.IsNullOrEmpty(currentStr))
                    {
                        currentPage = int.Parse(currentStr);
                    }
                }
            }

            return currentPage == totalPage;
        }
예제 #7
0
        /// <summary>
        /// Reads an interval value from the page using <c>Selector</c>, applies the
        /// configured <c>IntervalFormatters</c> and parses the result as an integer.
        /// </summary>
        /// <param name="page">Page the interval is selected from.</param>
        /// <returns>
        /// The parsed interval, or <c>null</c> when nothing is selected or the value is
        /// empty before or after formatting.
        /// </returns>
        /// <exception cref="FormatException">
        /// Propagated from <see cref="int.Parse(string)"/> when the formatted value is not a number.
        /// </exception>
        public int? Interval(Page page)
        {
            // Select may yield null when nothing matches; "?." turns that into a null
            // value, which the emptiness check below already handles.
            string intervalStr = page.Selectable.Select(SelectorUtil.Parse(Selector))?.GetValue();

            if (string.IsNullOrEmpty(intervalStr))
            {
                return null;
            }

            if (IntervalFormatters != null)
            {
                foreach (var formatter in IntervalFormatters)
                {
                    intervalStr = formatter.Formate(intervalStr);
                }
            }

            // A formatter may have reduced the value to nothing.
            if (string.IsNullOrEmpty(intervalStr))
            {
                return null;
            }

            return int.Parse(intervalStr);
        }
예제 #8
0
        /// <summary>
        /// Extracts the value of one field from <paramref name="item"/>.
        /// </summary>
        /// <param name="item">Selectable the field selector is applied to.</param>
        /// <param name="page">Page passed to the environment-value lookup.</param>
        /// <param name="field">Field definition; <c>null</c> yields <c>null</c>.</param>
        /// <param name="index">Index passed to the environment-value lookup.</param>
        /// <returns>
        /// <c>null</c> when the field or its selector is missing; a formatted string for
        /// environment and single-value fields; a <c>JArray</c> for multi-value fields;
        /// the node count as a string ("-1" when no node list is available) for count fields.
        /// </returns>
        private dynamic ExtractField(ISelectable item, Page page, Field field, int index)
        {
            if (field == null)
            {
                return null;
            }

            ISelector selector = SelectorUtil.Parse(field.Selector);
            if (selector == null)
            {
                return null;
            }

            // Environment selectors read their value from the page context, not from the item.
            var enviromentSelector = selector as EnviromentSelector;
            if (enviromentSelector != null)
            {
                string enviromentValue = GetEnviromentValue(enviromentSelector.Field, page, index);
                foreach (var formatter in field.Formatters)
                {
                    enviromentValue = formatter.Formate(enviromentValue);
                }
                return enviromentValue;
            }

            bool needPlainText = field.Option == PropertyDefine.Options.PlainText;

            if (field.Multi)
            {
                var nodes = item.SelectList(selector).Nodes();

                List<string> texts = new List<string>();
                foreach (var node in nodes)
                {
                    texts.Add(node.GetValue(needPlainText));
                }
                foreach (var formatter in field.Formatters)
                {
                    texts = formatter.Formate(texts);
                }
                return new JArray(texts);
            }

            if (field.Option == PropertyDefine.Options.Count)
            {
                var countedNodes = item.SelectList(selector).Nodes();
                string count = countedNodes?.Count.ToString();
                if (string.IsNullOrEmpty(count))
                {
                    count = "-1";
                }
                return count;
            }

            string text = item.Select(selector)?.GetValue(needPlainText);
            foreach (var formatter in field.Formatters)
            {
                text = formatter.Formate(text);
            }
            return text;
        }
예제 #9
0
        /// <summary>
        /// Extracts the value of one column from <paramref name="item"/> and converts it
        /// with <c>TryConvert</c> to the column's data type.
        /// </summary>
        /// <param name="item">Selectable the column selector is applied to.</param>
        /// <param name="page">Page passed to the environment-value lookup.</param>
        /// <param name="field">Column definition; <c>null</c> yields <c>null</c>.</param>
        /// <param name="index">Index passed to the environment-value lookup.</param>
        /// <returns>
        /// <c>null</c> when the column or its selector is missing; the number of matched
        /// nodes for count columns; otherwise the formatted and converted value.
        /// </returns>
        /// <remarks>
        /// In DEBUG builds an exception thrown by a formatter is logged and the loop
        /// continues; in other builds it propagates.
        /// </remarks>
        private object ExtractField(ISelectable item, Page page, Column field, int index)
        {
            if (field == null)
            {
                return null;
            }

            ISelector selector = SelectorUtil.Parse(field.Selector);
            if (selector == null)
            {
                return null;
            }

            // Environment selectors read their value from the page context, not from the item.
            var enviromentSelector = selector as EnviromentSelector;
            if (enviromentSelector != null)
            {
                var enviromentValue = SelectorUtil.GetEnviromentValue(enviromentSelector.Field, page, index);
                foreach (var formatter in field.Formatters)
                {
#if DEBUG
                    try
                    {
#endif
                        enviromentValue = formatter.Formate(enviromentValue);
#if DEBUG
                    }
                    catch (Exception e)
                    {
                        Logger.NLog(e.ToString(), Level.Error);
                    }
#endif
                }
                return TryConvert(enviromentValue, field.DataType);
            }

            if (field.Option == PropertyDefine.Options.Count)
            {
                var nodes = item.SelectList(selector).Nodes();
                return nodes.Count;
            }

            var value = (object)item.Select(selector)?.GetValue(field.Option == PropertyDefine.Options.PlainText);

            foreach (var formatter in field.Formatters)
            {
#if DEBUG
                try
                {
#endif
                    value = formatter.Formate(value);
#if DEBUG
                }
                catch (Exception e)
                {
                    Logger.NLog(e.ToString(), Level.Error);
                }
#endif
            }

            return TryConvert(value, field.DataType);
        }
예제 #10
0
        /// <summary>
        /// Decides whether paging should stop by comparing the current page number with
        /// the total page count. Each number comes either from an environment value
        /// (selector type <c>Enviroment</c>) or from the page content.
        /// </summary>
        /// <param name="page">Page the two numbers are read from.</param>
        /// <param name="creator">Not used by this implementation.</param>
        /// <returns>
        /// <c>true</c> when the current page number equals the total page count;
        /// otherwise <c>false</c>.
        /// </returns>
        /// <exception cref="FormatException">
        /// Propagated from <see cref="int.Parse(string)"/> when a formatted value is not a number.
        /// </exception>
        public bool NeedStop(Page page, BaseTargetUrlsCreator creator)
        {
            // The two defaults differ so that two unavailable values never compare equal.
            int totalPage = -2000;

            if (TotalPageSelector != null)
            {
                string totalStr = string.Empty;
                if (TotalPageSelector.Type == SelectorType.Enviroment)
                {
                    var selector = SelectorUtil.Parse(TotalPageSelector) as EnviromentSelector;
                    if (selector != null)
                    {
                        totalStr = EntityExtractor.GetEnviromentValue(selector.Field, page, 0);
                    }
                }
                else
                {
                    // Select may yield null when nothing matches; "?." turns that into a
                    // null value, which the emptiness check below already handles.
                    totalStr = page.Selectable.Select(SelectorUtil.Parse(TotalPageSelector))?.GetValue();
                }

                if (!string.IsNullOrEmpty(totalStr))
                {
                    if (TotalPageFormatters != null)
                    {
                        foreach (var formatter in TotalPageFormatters)
                        {
                            totalStr = formatter.Formate(totalStr);
                        }
                    }
                    // A formatter may have reduced the value to nothing.
                    if (!string.IsNullOrEmpty(totalStr))
                    {
                        totalPage = int.Parse(totalStr);
                    }
                }
            }

            int currentPage = -1000;

            if (CurrenctPageSelector != null)
            {
                string currentStr = string.Empty;
                if (CurrenctPageSelector.Type == SelectorType.Enviroment)
                {
                    var selector = SelectorUtil.Parse(CurrenctPageSelector) as EnviromentSelector;
                    if (selector != null)
                    {
                        currentStr = EntityExtractor.GetEnviromentValue(selector.Field, page, 0);
                    }
                }
                else
                {
                    // Same null-safe read as for the total page count.
                    currentStr = page.Selectable.Select(SelectorUtil.Parse(CurrenctPageSelector))?.GetValue();
                }

                if (!string.IsNullOrEmpty(currentStr))
                {
                    if (CurrnetPageFormatters != null)
                    {
                        foreach (var formatter in CurrnetPageFormatters)
                        {
                            currentStr = formatter.Formate(currentStr);
                        }
                    }
                    if (!string.IsNullOrEmpty(currentStr))
                    {
                        currentPage = int.Parse(currentStr);
                    }
                }
            }

            return currentPage == totalPage;
        }
예제 #11
0
        /// <summary>
        /// Extracts the value of a data token from <paramref name="item"/>. A token is
        /// either a plain field (string or array result) or an entity whose child
        /// tokens are extracted recursively into JSON objects.
        /// </summary>
        /// <param name="item">Selectable the token's selector is applied to.</param>
        /// <param name="page">Page passed to the environment-value lookup.</param>
        /// <param name="field">Token definition (field or entity).</param>
        /// <param name="index">Index passed to the environment-value lookup.</param>
        /// <returns>
        /// <c>null</c> when the token has no selector; a string for environment,
        /// count ("-1" when no node list is available) and single-value fields; a
        /// <c>JArray</c> for multi-value fields and multi entities; a <c>JObject</c>
        /// for a single entity.
        /// </returns>
        private dynamic ExtractField(ISelectable item, Page page, DataToken field, int index)
        {
            ISelector selector = SelectorUtil.Parse(field.Selector);

            if (selector == null)
            {
                return null;
            }

            var entity = field as Entity;

            if (entity != null)
            {
                if (field.Multi)
                {
                    // One JSON object per matched node.
                    JArray objs = new JArray();
                    var selectables = item.SelectList(selector).Nodes();
                    foreach (var selectable in selectables)
                    {
                        JObject obj = new JObject();

                        foreach (var child in entity.Fields)
                        {
                            obj.Add(child.Name, ExtractField(selectable, page, child, 0));
                        }
                        objs.Add(obj);
                    }
                    return objs;
                }
                else
                {
                    JObject obj = new JObject();
                    var selectable = item.Select(selector);
                    foreach (var child in entity.Fields)
                    {
                        // Recurse into the child token. Passing the entity itself here
                        // (as the code previously did) re-extracts the same entity for
                        // every child and never terminates normally.
                        obj.Add(child.Name, ExtractField(selectable, page, child, 0));
                    }
                    return obj;
                }
            }

            // Plain field. "f" stays null if the token is not a Field; the option checks
            // below then evaluate to false instead of failing on a cast, in line with
            // the formatter guards.
            var f = field as Field;
            string tmpValue;

            var enviromentSelector = selector as EnviromentSelector;
            if (enviromentSelector != null)
            {
                // Environment selectors read their value from the page context, not from the item.
                tmpValue = GetEnviromentValue(enviromentSelector.Field, page, index);
                if (f != null)
                {
                    foreach (var formatter in f.Formatters)
                    {
                        tmpValue = formatter.Formate(tmpValue);
                    }
                }
                return tmpValue;
            }

            bool needPlainText = f != null && f.Option == PropertySelector.Options.PlainText;

            if (field.Multi)
            {
                var propertyValues = item.SelectList(selector).Nodes();

                List<string> results = new List<string>();
                foreach (var propertyValue in propertyValues)
                {
                    results.Add(propertyValue.GetValue(needPlainText));
                }
                if (f != null)
                {
                    foreach (var formatter in f.Formatters)
                    {
                        results = formatter.Formate(results);
                    }
                }
                return new JArray(results);
            }

            bool needCount = f != null && f.Option == PropertySelector.Options.Count;
            if (needCount)
            {
                var countedValues = item.SelectList(selector).Nodes();
                return countedValues?.Count.ToString() ?? "-1";
            }

            tmpValue = item.Select(selector)?.GetValue(needPlainText);
            if (f != null)
            {
                foreach (var formatter in f.Formatters)
                {
                    tmpValue = formatter.Formate(tmpValue);
                }
            }
            return tmpValue;
        }
예제 #12
0
        /// <summary>
        /// Processes a page: checks that its URL matches a target-URL pattern, stores
        /// the global values in the request extras, extracts the entity objects and,
        /// unless the page opts out, extracts follow-up links.
        /// </summary>
        /// <param name="page">Page to process.</param>
        /// <returns>
        /// The extracted objects, or <c>null</c> when the URL matches none of the
        /// configured patterns or when nothing could be selected.
        /// </returns>
        public List<JObject> Process(Page page)
        {
            List<JObject> result = new List<JObject>();

            // With no pattern configured at all, the page counts as a target.
            bool isTarget = true;
            bool matched = false;

            foreach (var targetUrlExtractor in EntityMetadata.TargetUrlExtractors)
            {
                foreach (var regex in targetUrlExtractor.Regexes)
                {
                    isTarget = regex.IsMatch(page.Url);
                    if (isTarget)
                    {
                        matched = true;
                        break;
                    }
                }

                // Stop at the first match; otherwise a later extractor's non-matching
                // pattern would overwrite the positive result.
                if (matched)
                {
                    break;
                }
            }
            if (!isTarget)
            {
                return null;
            }

            if (_globalValues != null && _globalValues.Count > 0)
            {
                foreach (var enviromentValue in _globalValues)
                {
                    string name = enviromentValue.Name;
                    var value = page.Selectable.Select(SelectorUtil.Parse(enviromentValue)).GetValue();
                    page.Request.PutExtra(name, value);
                }
            }

            ISelector selector = SelectorUtil.Parse(EntityMetadata.Entity.Selector);

            if (selector != null && EntityMetadata.Entity.Multi)
            {
                // Multi-entity mode: one object per matched node.
                var list = page.Selectable.SelectList(selector).Nodes();
                if (list == null || list.Count == 0)
                {
                    result = null;
                }
                else
                {
                    var countToken = EntityMetadata.Limit;
                    if (countToken != null)
                    {
                        list = list.Take(countToken.Value).ToList();
                    }

                    int index = 0;
                    foreach (var item in list)
                    {
                        JObject obj = ProcessSingle(page, item, index);
                        if (obj != null)
                        {
                            result.Add(obj);
                        }
                        index++;
                    }
                }
            }
            else
            {
                ISelectable select = selector == null ? page.Selectable : page.Selectable.Select(selector);

                if (select != null)
                {
                    // NOTE(review): unlike the multi branch, the single result is added
                    // without a null check - confirm whether callers expect that.
                    var singleResult = ProcessSingle(page, select, 0);
                    result = new List<JObject> { singleResult };
                }
                else
                {
                    result = null;
                }
            }

            //if (EntityMetadata.TargetUrlsCreators != null && EntityMetadata.TargetUrlExtractors.Count > 0)
            //{
            //	foreach (var targetUrlsCreator in EntityMetadata.TargetUrlsCreators)
            //	{
            //		page.AddTargetRequests(targetUrlsCreator.Handle(page));
            //	}
            //}

            if (!page.MissExtractTargetUrls)
            {
                ExtractLinks(page, EntityMetadata.TargetUrlExtractors);
            }

            return result;
        }
예제 #13
0
        /// <summary>
        /// Processes a page: stores the environment values in the request extras and
        /// extracts either one object or a list of objects, depending on whether the
        /// entity definition has a selector and is marked as multi.
        /// </summary>
        /// <param name="page">Page to process.</param>
        /// <returns>
        /// A list of objects in multi mode, the result of <c>ProcessSingle</c> in
        /// single mode, or <c>null</c> when the selector yields nothing.
        /// </returns>
        public dynamic Process(Page page)
        {
            bool hasEnviromentValues = _enviromentValues != null && _enviromentValues.Count > 0;
            if (hasEnviromentValues)
            {
                foreach (var enviromentValue in _enviromentValues)
                {
                    string key = enviromentValue.Name;
                    var extracted = page.Selectable.Select(SelectorUtil.Parse(enviromentValue.Selector)).GetValue();
                    page.Request.PutExtra(key, extracted);
                }
            }

            ISelector selector = SelectorUtil.Parse(_entityDefine.Selector);

            // Multi mode needs both a selector and the Multi flag.
            bool isMulti = selector != null && _entityDefine.Multi;

            if (!isMulti)
            {
                ISelectable scope = selector == null
                    ? page.Selectable
                    : page.Selectable.Select(selector);

                // Only a selector-based lookup is checked for a missing result; the
                // whole-page selectable is passed on as-is.
                if (selector != null && scope == null)
                {
                    return null;
                }

                return ProcessSingle(page, scope, _entityDefine, 0);
            }

            var nodes = page.Selectable.SelectList(selector).Nodes();
            if (nodes == null || nodes.Count == 0)
            {
                return null;
            }

            var limit = _entityDefine.Limit;
            if (limit != null)
            {
                nodes = nodes.Take(limit.Value).ToList();
            }

            List<JObject> processed = new List<JObject>();
            int position = 0;
            foreach (var node in nodes)
            {
                JObject processedObject = ProcessSingle(page, node, _entityDefine, position);
                if (processedObject != null)
                {
                    processed.Add(processedObject);
                }
                position++;
            }

            return processed;
        }