Exemplo n.º 1
0
        /// <summary>
        /// Builds an <see cref="ISiteCrawler"/> for the given site by resolving the HTML reader,
        /// item reader, page reader and page parser from <c>Container</c> under the names
        /// obtained from <c>siteParameter.CustomProcessors</c>.
        /// </summary>
        /// <param name="siteParameter">Site configuration; also passed to the resolved components as a constructor override.</param>
        /// <returns>A <see cref="GeneralSiteCrawler"/> built from the resolved page reader and page parser.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="siteParameter"/> is <c>null</c>.</exception>
        public static ISiteCrawler Create(SiteParameter siteParameter)
        {
            if (siteParameter == null)
            {
                throw new ArgumentNullException(nameof(siteParameter));
            }

            // Registration names of the components this factory actually resolves.
            string htmlReaderName = GetValueOrDefault(siteParameter.CustomProcessors, "IHtmlReader");
            string pageParserName = GetValueOrDefault(siteParameter.CustomProcessors, "IPageParser");
            string itemReaderName = GetValueOrDefault(siteParameter.CustomProcessors, "IItemReader");
            string pageReaderName = GetValueOrDefault(siteParameter.CustomProcessors, "IPageReader");

            // Overrides are matched by constructor parameter name ("siteParameter", "htmlReader", "itemReader").
            ParameterOverride siteParameterParameter = new ParameterOverride("siteParameter", siteParameter);

            IHtmlReader       htmlReader          = Container.Resolve<IHtmlReader>(htmlReaderName);
            ParameterOverride htmlReaderParameter = new ParameterOverride("htmlReader", htmlReader);

            IItemReader       itemReader          = Container.Resolve<IItemReader>(itemReaderName, siteParameterParameter);
            ParameterOverride itemReaderParameter = new ParameterOverride("itemReader", itemReader);

            // The page reader receives the same html reader instance as the page parser.
            IPageReader pageReader = Container.Resolve<IPageReader>(
                pageReaderName, siteParameterParameter, htmlReaderParameter, itemReaderParameter);

            IPageParser pageParser = Container.Resolve<IPageParser>(
                pageParserName, siteParameterParameter, htmlReaderParameter);

            return new GeneralSiteCrawler(pageReader, pageParser);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the tag name that follows an opening angle bracket.
        /// </summary>
        /// <param name="html">Reader positioned where an element may start.</param>
        /// <returns>
        /// <c>null</c> when the next character is not '&lt;' or the next two characters are "&lt;/";
        /// otherwise the run of letters, digits, '!' and '-' that follows the '&lt;'
        /// (possibly an empty string).
        /// </returns>
        static string ParseElementOpener(IHtmlReader html)
        {
            // Nothing is consumed unless this is an opening tag ("<" but not "</").
            if (html.Peek() != '<' || string.Join("", html.Peek(2)) == "</")
            {
                return null;
            }

            html.Consume();

            var nameChars = new List<char>();

            while (html.Any())
            {
                var next = html.Peek();

                // '!' and '-' are accepted so that names such as "!--" are read whole.
                var isNameChar = char.IsLetterOrDigit(next) || next == '!' || next == '-';
                if (!isNameChar)
                {
                    break;
                }

                html.Consume();
                nameChars.Add(next);
            }

            return string.Join("", nameChars);
        }
Exemplo n.º 3
0
 /// <summary>
 /// Stores the supplied collaborators and sets the page number to
 /// <c>siteParameter.StartNumber</c>.
 /// </summary>
 /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
 public ParamPageReader(SiteParameter siteParameter, IHtmlReader htmlReader, IItemReader itemReader)
 {
     if (siteParameter is null)
     {
         throw new ArgumentNullException(nameof(siteParameter));
     }
     this.siteParameter = siteParameter;

     if (htmlReader is null)
     {
         throw new ArgumentNullException(nameof(htmlReader));
     }
     this.htmlReader = htmlReader;

     if (itemReader is null)
     {
         throw new ArgumentNullException(nameof(itemReader));
     }
     this.itemReader = itemReader;

     // Page numbering begins at the configured start number.
     this.pageNumber = this.siteParameter.StartNumber;
 }
Exemplo n.º 4
0
        /// <summary>
        /// Reads one attribute name from the reader.
        /// Leading whitespace is skipped, and whitespace following the name is consumed as well.
        /// </summary>
        /// <param name="html">Reader positioned inside a tag, after the element name or a previous attribute.</param>
        /// <returns>
        /// The attribute name, or an empty string when the next non-whitespace character
        /// cannot start a name (for example '&gt;', '/' or '=').
        /// </returns>
        static string ParseAttributeName(IHtmlReader html)
        {
            var word          = new List<char>();
            var hasWhitespace = false;

            while (html.Any())
            {
                var c = html.Peek();
                if (char.IsWhiteSpace(c))
                {
                    // Whitespace only terminates the name once at least one name character was read.
                    hasWhitespace = word.Count > 0;
                    html.Consume();
                    continue;
                }

                // Besides letters and digits, accept the punctuation that routinely appears in
                // real attribute names (data-id, aria-label, xml:lang, v-on:click.prevent, my_attr).
                // Stopping at these characters used to truncate the name and derail the rest of the tag.
                var isNameChar = char.IsLetterOrDigit(c) ||
                                 c == '-' ||
                                 c == '_' ||
                                 c == ':' ||
                                 c == '.';

                if (hasWhitespace || !isNameChar)
                {
                    break;
                }

                html.Consume();
                word.Add(c);
            }

            return string.Join("", word);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Creates the <c>HttpClient</c> on top of a <c>FakeHttpHandler</c> and hands it to a new
        /// <c>HtmlReader</c>; both are stored in the corresponding fields.
        /// </summary>
        public void Initialize()
        {
            httpClient = new HttpClient(new FakeHttpHandler());
            htmlReader = new HtmlReader(httpClient);
        }
Exemplo n.º 6
0
 /// <summary>
 /// Stores the injected collaborators and creates the lock object and the empty task list.
 /// </summary>
 public Multithread(IHtmlParser htmlParser, IHtmlReader htmlReader, IFileSystemManager fileSystemManager, IDataBaseManager dataBaseManager)
 {
     // Internal state owned by this instance.
     this.tasks  = new List<Task>();
     this.locker = new object();

     // Injected collaborators (stored as given; no validation is performed here).
     this.dataBaseManager   = dataBaseManager;
     this.fileSystemManager = fileSystemManager;
     this.htmlReader        = htmlReader;
     this.htmlParser        = htmlParser;
 }
Exemplo n.º 7
0
        /// <summary>
        /// Parses one element (including its attributes and, where applicable, its content)
        /// starting at the reader's current position. Leading whitespace is skipped first.
        /// </summary>
        /// <param name="html">Reader to consume from.</param>
        /// <returns>
        /// The parsed element, or <c>null</c> when <c>ParseElementOpener</c> yields no name
        /// (the input does not start with an opening tag).
        /// </returns>
        public static HtmlElement ParseElement(IHtmlReader html)
        {
            html.Consume(char.IsWhiteSpace);

            var name = ParseElementOpener(html);
            if (string.IsNullOrWhiteSpace(name))
            {
                return null;
            }

            var element = new HtmlElement(name);

            if (element.IsDeclaration)
            {
                // A declaration's content is taken verbatim up to its closer followed by '>'.
                var closer          = DeclarationCloser(name);
                var declarationText = ParseTextElement(html, string.Concat(closer, ">"), true);

                // ParseTextElement returns null for empty or unterminated content; never store
                // a null child (same rule as the text-only container branch below).
                if (declarationText != null)
                {
                    element.Children.Add(declarationText);
                }

                return element;
            }

            var attributes = ParseAttributes(html);
            element.Attributes.AddRange(attributes);

            html.Consume(char.IsWhiteSpace);

            // Self-closing form: no content follows.
            if (html.TryConsume("/>", false, true))
            {
                element.IsClosed = true;
                return element;
            }

            // Best effort: a missing '>' is tolerated, so the result is deliberately ignored.
            html.TryConsume(">", false, true);

            if (element.IsTextOnlyContainer)
            {
                // Content is raw text up to the matching closing tag.
                var textElement = ParseTextElement(html, string.Concat("</", name, ">"), true);
                if (textElement != null)
                {
                    element.Children.Add(textElement);
                }
            }
            else if (!element.IsNonContainer)
            {
                var children = ParseElements(html);
                element.Children.AddRange(children);

                TryConsumeElementCloser(name, html);
            }

            return element;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Reads attributes one after another until <c>ParseAttribute</c> returns <c>null</c>.
        /// </summary>
        /// <param name="html">Reader positioned after the element name.</param>
        /// <returns>The attributes read, in source order (empty when there are none).</returns>
        static IEnumerable<HtmlAttribute> ParseAttributes(IHtmlReader html)
        {
            var attributes = new List<HtmlAttribute>();

            for (var attribute = ParseAttribute(html);
                 attribute != null;
                 attribute = ParseAttribute(html))
            {
                attributes.Add(attribute);
            }

            return attributes;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Reads a sequence of sibling nodes. At each step an element is tried first and,
        /// failing that, a text run up to the next "&lt;"; reading stops when both yield <c>null</c>.
        /// </summary>
        /// <param name="html">Reader to consume from.</param>
        /// <returns>The nodes read, in source order (empty when there are none).</returns>
        static IEnumerable<IHtmlNode> ParseElements(IHtmlReader html)
        {
            var nodes = new List<IHtmlNode>();

            while (true)
            {
                IHtmlNode node = ParseElement(html);

                if (node == null)
                {
                    // Not an element: fall back to text, leaving the '<' unconsumed.
                    node = ParseTextElement(html, "<", false);
                }

                if (node == null)
                {
                    break;
                }

                nodes.Add(node);
            }

            return nodes;
        }
Exemplo n.º 10
0
        /// <summary>
        /// Creates an HtmlContentFragment object.
        /// </summary>
        /// <param name="reader">The HTML reader/analyzer.</param>
        /// <param name="startIndex">Start position.</param>
        /// <param name="length">Content length.</param>
        public HtmlContentFragment(IHtmlReader reader, int startIndex, int length)
        {
            // Validate every argument before any state is assigned.
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            if (startIndex < 0)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            if (length <= 0)
            {
                throw new ArgumentOutOfRangeException("length");
            }

            Reader = reader;
            StartIndex = startIndex;
            Length = length;
        }
Exemplo n.º 11
0
        /// <summary>
        /// Reads a single attribute: its name and then, via <c>ParseAttributeValue</c>, its value.
        /// </summary>
        /// <param name="html">Reader positioned inside a tag.</param>
        /// <returns>The attribute, or <c>null</c> when no (non-blank) name could be read.</returns>
        public static HtmlAttribute ParseAttribute(IHtmlReader html)
        {
            var attributeName = ParseAttributeName(html);

            if (string.IsNullOrWhiteSpace(attributeName))
            {
                return null;
            }

            var attribute = new HtmlAttribute();
            attribute.Name  = attributeName;
            attribute.Value = ParseAttributeValue(html);

            return attribute;
        }
Exemplo n.º 12
0
        /// <summary>
        /// Reads text from the reader up to <paramref name="upto"/> and wraps it, with whitespace
        /// collapsed by <c>CollapseWhiteSpace</c>, in an <c>HtmlTextElement</c>.
        /// </summary>
        /// <param name="html">Reader to consume from.</param>
        /// <param name="upto">Terminator string handed to <c>TryConsume</c>.</param>
        /// <param name="consumeTarget">Forwarded to <c>TryConsume</c>; presumably controls whether the terminator itself is consumed.</param>
        /// <returns>The text node, or <c>null</c> when <c>TryConsume</c> fails or the captured text is empty.</returns>
        static IHtmlNode ParseTextElement(
            IHtmlReader html,
            string upto, bool consumeTarget)
        {
            string captured;

            var found = html.TryConsume(upto, true, consumeTarget, out captured);
            if (found && captured != string.Empty)
            {
                var textNode = new HtmlTextElement();
                textNode.Value = CollapseWhiteSpace(captured);
                return textNode;
            }

            return null;
        }
Exemplo n.º 13
0
        /// <summary>
        /// Creates an HtmlContentFragment object.
        /// </summary>
        /// <param name="reader">The HTML reader/analyzer.</param>
        /// <param name="startIndex">Start position.</param>
        /// <param name="length">Content length.</param>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="startIndex"/> is negative, or <paramref name="length"/> is not positive.
        /// </exception>
        public HtmlContentFragment(IHtmlReader reader, int startIndex, int length)
        {
            if (reader == null)
            {
                throw new ArgumentNullException("reader");
            }

            if (startIndex < 0)
            {
                throw new ArgumentOutOfRangeException("startIndex");
            }

            // A fragment must cover at least one character.
            if (length < 1)
            {
                throw new ArgumentOutOfRangeException("length");
            }

            this.Reader = reader;
            this.StartIndex = startIndex;
            this.Length = length;
        }
Exemplo n.º 14
0
 /// <summary>
 /// Creates the parser; both arguments are passed unchanged to the base-class constructor.
 /// </summary>
 public HenanHrPageParser(SiteParameter siteParameter, IHtmlReader htmlReader)
     : base(siteParameter, htmlReader)
 {
 }
Exemplo n.º 15
0
        /// <summary>
        /// Reads an attribute value introduced by '='. The value may be unquoted, or wrapped in
        /// single or double quotes; inside the value a backslash makes the following quote or
        /// backslash literal.
        /// </summary>
        /// <param name="html">Reader positioned directly after an attribute name.</param>
        /// <returns>
        /// <c>null</c> when the next character is not '=' (attribute without value); otherwise the
        /// value text without the surrounding quotes (possibly empty).
        /// </returns>
        static string ParseAttributeValue(IHtmlReader html)
        {
            if (html.Peek() != '=')
            {
                return null;
            }

            html.Consume();

            var word    = new List<char>();
            var quote   = '\0';   // active quote character; '\0' while the value is unquoted
            var escaped = false;  // true when the previous character was an unescaped backslash

            while (html.Any())
            {
                var c = html.Peek();

                if (quote == '\0')
                {
                    if (c == '>')
                    {
                        // End of the tag: leave '>' for the caller. Consuming it here made the
                        // attribute loop continue into the element's content.
                        break;
                    }

                    if (char.IsWhiteSpace(c))
                    {
                        // Whitespace before the value is skipped; after the value it ends it.
                        html.Consume();
                        if (word.Count > 0)
                        {
                            break;
                        }

                        continue;
                    }
                }

                if (!escaped)
                {
                    if (c == '\\')
                    {
                        escaped = true;
                        html.Consume();
                        continue;
                    }

                    if (c == '\'' || c == '"')
                    {
                        if (quote == c)
                        {
                            // Matching closing quote ends the value.
                            html.Consume();
                            break;
                        }

                        if (quote == '\0')
                        {
                            // Opening quote.
                            quote = c;
                            html.Consume();
                            continue;
                        }

                        // The other kind of quote inside a quoted value is ordinary text.
                    }
                }

                // Ordinary (or escaped) character. The escape applies to one character only, so
                // the flag is always cleared here; leaving it set after an escaped quote caused
                // the real closing quote to be swallowed as well.
                escaped = false;
                html.Consume();
                word.Add(c);
            }

            return string.Join("", word);
        }
Exemplo n.º 16
0
        /// <summary>
        /// Attempts to consume the closing tag "&lt;/name&gt;" for the given element name.
        /// The outcome of the attempt is ignored.
        /// </summary>
        /// <param name="name">Element name used to build the closing tag.</param>
        /// <param name="html">Reader to consume from.</param>
        static void TryConsumeElementCloser(string name, IHtmlReader html)
        {
            html.TryConsume(string.Concat("</", name, ">"), false, true);
        }
Exemplo n.º 17
0
 /// <summary>
 /// Stores the site configuration and the HTML reader used by this parser.
 /// </summary>
 /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
 public JsonPageParser(SiteParameter siteParameter, IHtmlReader htmlReader)
 {
     if (siteParameter is null)
     {
         throw new ArgumentNullException(nameof(siteParameter));
     }
     this.SiteParameter = siteParameter;

     if (htmlReader is null)
     {
         throw new ArgumentNullException(nameof(htmlReader));
     }
     this.HtmlReader = htmlReader;
 }