/// <summary>
/// Builds a site crawler for the given site configuration.
/// </summary>
/// <remarks>
/// The registration name of each collaborator is looked up in
/// <c>siteParameter.CustomProcessors</c> via <c>GetValueOrDefault</c>, keyed by the
/// interface name, and the collaborator is then resolved from <c>Container</c>.
/// Already-resolved instances are handed to dependent resolutions through
/// <see cref="ParameterOverride"/>s named after the constructor parameters
/// ("siteParameter", "htmlReader", "itemReader").
/// </remarks>
/// <param name="siteParameter">Configuration of the site to crawl; must not be null.</param>
/// <returns>A <see cref="GeneralSiteCrawler"/> wired with the resolved page reader and page parser.</returns>
/// <exception cref="ArgumentNullException"><paramref name="siteParameter"/> is null.</exception>
public static ISiteCrawler Create(SiteParameter siteParameter)
{
    if (siteParameter == null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }

    // Registration names for the collaborators this factory actually resolves.
    // (An "IDataService" name used to be looked up here but was never used.)
    string htmlReaderName = GetValueOrDefault(siteParameter.CustomProcessors, "IHtmlReader");
    string pageParserName = GetValueOrDefault(siteParameter.CustomProcessors, "IPageParser");
    string itemReaderName = GetValueOrDefault(siteParameter.CustomProcessors, "IItemReader");
    string pageReaderName = GetValueOrDefault(siteParameter.CustomProcessors, "IPageReader");

    // Resolution order matters: later resolutions receive earlier instances as overrides.
    IHtmlReader htmlReader = Container.Resolve<IHtmlReader>(htmlReaderName);
    ParameterOverride htmlReaderParameter = new ParameterOverride("htmlReader", htmlReader);
    ParameterOverride siteParameterParameter = new ParameterOverride("siteParameter", siteParameter);

    IItemReader itemReader = Container.Resolve<IItemReader>(itemReaderName, siteParameterParameter);
    ParameterOverride itemReaderParameter = new ParameterOverride("itemReader", itemReader);

    IPageReader pageReader = Container.Resolve<IPageReader>(
        pageReaderName,
        siteParameterParameter,
        htmlReaderParameter,
        itemReaderParameter);

    IPageParser pageParser = Container.Resolve<IPageParser>(
        pageParserName,
        siteParameterParameter,
        htmlReaderParameter);

    return new GeneralSiteCrawler(pageReader, pageParser);
}
/// <summary>
/// Reads the opening "&lt;name" part of an element and returns the name.
/// </summary>
/// <param name="html">Reader positioned where an element may start.</param>
/// <returns>
/// The characters following '&lt;' that are letters, digits, '!' or '-' (possibly an
/// empty string), or null when the reader is not at '&lt;' or is at a closing tag ("&lt;/").
/// </returns>
static string ParseElementOpener(IHtmlReader html)
{
    var atOpenBracket = html.Peek() == '<';
    if (!atOpenBracket)
    {
        return null;
    }

    // "</" starts a closing tag, which is not an opener.
    var lookahead = string.Join("", html.Peek(2));
    if (lookahead == "</")
    {
        return null;
    }

    // Skip the '<' itself.
    html.Consume();

    var nameChars = new List<char>();
    while (html.Any())
    {
        var next = html.Peek();
        var belongsToName = char.IsLetterOrDigit(next) || next == '!' || next == '-';
        if (!belongsToName)
        {
            break;
        }

        html.Consume();
        nameChars.Add(next);
    }

    return string.Join("", nameChars);
}
/// <summary>
/// Creates a page reader and sets the current page number to the site's configured start number.
/// </summary>
/// <param name="siteParameter">Site configuration; supplies <c>StartNumber</c>.</param>
/// <param name="htmlReader">Reader stored for later use by this instance.</param>
/// <param name="itemReader">Item reader stored for later use by this instance.</param>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public ParamPageReader(SiteParameter siteParameter, IHtmlReader htmlReader, IItemReader itemReader)
{
    if (siteParameter is null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }
    this.siteParameter = siteParameter;

    if (htmlReader is null)
    {
        throw new ArgumentNullException(nameof(htmlReader));
    }
    this.htmlReader = htmlReader;

    if (itemReader is null)
    {
        throw new ArgumentNullException(nameof(itemReader));
    }
    this.itemReader = itemReader;

    // Paging begins at the configured start number.
    this.pageNumber = this.siteParameter.StartNumber;
}
/// <summary>
/// Reads an attribute name from the reader.
/// </summary>
/// <remarks>
/// Leading whitespace is skipped. The name starts with a letter or digit and may then
/// contain letters, digits, '-', '_', ':' and '.', so names such as <c>data-id</c>,
/// <c>aria-label</c> and <c>xml:lang</c> are read whole instead of being cut at the first
/// punctuation character. Whitespace after the name is consumed; the terminating
/// non-whitespace character (for example '=' or '>') is left in the reader.
/// </remarks>
/// <param name="html">Reader positioned inside an element's opening tag.</param>
/// <returns>The attribute name, or an empty string when no name is present.</returns>
static string ParseAttributeName(IHtmlReader html)
{
    var word = new List<char>();
    var hasWhitespace = false;

    while (html.Any())
    {
        var c = html.Peek();

        if (char.IsWhiteSpace(c))
        {
            // Whitespace only ends the name once at least one name character was read.
            hasWhitespace = word.Count > 0;
            html.Consume();
            continue;
        }

        // Punctuation is only accepted inside a name, never as its first character.
        var isNameChar = char.IsLetterOrDigit(c)
            || (word.Count > 0 && (c == '-' || c == '_' || c == ':' || c == '.'));

        if (hasWhitespace || !isNameChar)
        {
            break;
        }

        html.Consume();
        word.Add(c);
    }

    return string.Join("", word);
}
/// <summary>
/// Creates the <see cref="HttpClient"/> (backed by a <c>FakeHttpHandler</c>) and the
/// <c>HtmlReader</c> that wraps it, storing both in this instance's fields.
/// </summary>
public void Initialize()
{
    HttpMessageHandler fakeHandler = new FakeHttpHandler();

    httpClient = new HttpClient(fakeHandler);
    htmlReader = new HtmlReader(httpClient);
}
/// <summary>
/// Creates a <c>Multithread</c> instance from its collaborators and initializes
/// its lock object and its (initially empty) task list.
/// </summary>
/// <param name="htmlParser">HTML parser stored for later use.</param>
/// <param name="htmlReader">HTML reader stored for later use.</param>
/// <param name="fileSystemManager">File-system manager stored for later use.</param>
/// <param name="dataBaseManager">Database manager stored for later use.</param>
public Multithread(
    IHtmlParser htmlParser,
    IHtmlReader htmlReader,
    IFileSystemManager fileSystemManager,
    IDataBaseManager dataBaseManager)
{
    // Internal state owned by this instance.
    locker = new object();
    tasks = new List<Task>();

    // Injected collaborators (stored as given, without validation).
    this.htmlParser = htmlParser;
    this.htmlReader = htmlReader;
    this.fileSystemManager = fileSystemManager;
    this.dataBaseManager = dataBaseManager;
}
/// <summary>
/// Parses one element (including its attributes and content) from the reader.
/// </summary>
/// <remarks>
/// Leading whitespace is skipped first. Three shapes are handled:
/// declarations (content is read as a single text node up to the declaration closer
/// followed by '>'), self-closed elements ("/>"), and regular elements whose content is
/// either raw text (text-only containers) or nested nodes.
/// </remarks>
/// <param name="html">Reader positioned where an element may start.</param>
/// <returns>
/// The parsed element, or null when the reader is not positioned at an element opener.
/// </returns>
public static HtmlElement ParseElement(IHtmlReader html)
{
    html.Consume(char.IsWhiteSpace);

    var name = ParseElementOpener(html);
    if (string.IsNullOrWhiteSpace(name))
    {
        return null;
    }

    var element = new HtmlElement(name);

    if (element.IsDeclaration)
    {
        var closer = DeclarationCloser(name);
        var declarationText = ParseTextElement(html, string.Concat(closer, ">"), true);

        // ParseTextElement yields null for empty or unterminated content; never store a
        // null child (the text-only branch below applies the same guard).
        if (declarationText != null)
        {
            element.Children.Add(declarationText);
        }

        return element;
    }

    var attributes = ParseAttributes(html);
    element.Attributes.AddRange(attributes);

    html.Consume(char.IsWhiteSpace);
    if (html.TryConsume("/>", false, true))
    {
        element.IsClosed = true;
        return element;
    }

    // Best effort: the '>' may be missing in malformed input.
    html.TryConsume(">", false, true);

    if (element.IsTextOnlyContainer)
    {
        // Content is taken verbatim up to the matching closing tag.
        var textElement = ParseTextElement(html, string.Concat("</", name, ">"), true);
        if (textElement != null)
        {
            element.Children.Add(textElement);
        }
    }
    else if (!element.IsNonContainer)
    {
        var children = ParseElements(html);
        element.Children.AddRange(children);
        TryConsumeElementCloser(name, html);
    }

    return element;
}
/// <summary>
/// Reads consecutive attributes until <c>ParseAttribute</c> returns null.
/// </summary>
/// <param name="html">Reader positioned after an element name.</param>
/// <returns>The attributes read, in source order; empty when there are none.</returns>
static IEnumerable<HtmlAttribute> ParseAttributes(IHtmlReader html)
{
    var attributes = new List<HtmlAttribute>();

    for (var attribute = ParseAttribute(html); attribute != null; attribute = ParseAttribute(html))
    {
        attributes.Add(attribute);
    }

    return attributes;
}
/// <summary>
/// Reads a sequence of sibling nodes. At each step an element is tried first; if none is
/// found, a text node running up to the next '&lt;' is tried. Stops when neither yields a node.
/// </summary>
/// <param name="html">Reader positioned at the start of the content to parse.</param>
/// <returns>The nodes read, in source order; empty when there are none.</returns>
static IEnumerable<IHtmlNode> ParseElements(IHtmlReader html)
{
    var nodes = new List<IHtmlNode>();

    while (true)
    {
        IHtmlNode node = ParseElement(html);
        if (node == null)
        {
            // Not an element: fall back to text up to (but not including) the next '<'.
            node = ParseTextElement(html, "<", false);
        }

        if (node == null)
        {
            break;
        }

        nodes.Add(node);
    }

    return nodes;
}
/// <summary>
/// Creates an HtmlContentFragment object.
/// </summary>
/// <param name="reader">The HTML reader/analyzer.</param>
/// <param name="startIndex">The start position.</param>
/// <param name="length">The content length.</param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="startIndex"/> is negative, or <paramref name="length"/> is zero or negative.
/// </exception>
public HtmlContentFragment( IHtmlReader reader, int startIndex, int length )
{
  // Validate in declaration order so the first offending argument is the one reported.
  if ( reader == null )
  {
    throw new ArgumentNullException( "reader" );
  }

  var startIsNegative = startIndex < 0;
  if ( startIsNegative )
  {
    throw new ArgumentOutOfRangeException( "startIndex" );
  }

  var lengthIsPositive = length > 0;
  if ( !lengthIsPositive )
  {
    throw new ArgumentOutOfRangeException( "length" );
  }

  Reader = reader;
  StartIndex = startIndex;
  Length = length;
}
/// <summary>
/// Reads a single attribute: its name, followed by its value as returned by
/// <c>ParseAttributeValue</c>.
/// </summary>
/// <param name="html">Reader positioned where an attribute may start.</param>
/// <returns>The attribute, or null when no non-blank name could be read.</returns>
public static HtmlAttribute ParseAttribute(IHtmlReader html)
{
    var attributeName = ParseAttributeName(html);
    var hasName = !string.IsNullOrWhiteSpace(attributeName);
    if (!hasName)
    {
        return null;
    }

    // The value is read only after a usable name has been found.
    var attribute = new HtmlAttribute();
    attribute.Name = attributeName;
    attribute.Value = ParseAttributeValue(html);
    return attribute;
}
/// <summary>
/// Reads text via <c>html.TryConsume</c> using <paramref name="upto"/> as the target and
/// wraps it, with whitespace collapsed by <c>CollapseWhiteSpace</c>, in a text node.
/// </summary>
/// <param name="html">Reader to consume from.</param>
/// <param name="upto">Target string handed to <c>TryConsume</c>.</param>
/// <param name="consumeTarget">Forwarded to <c>TryConsume</c>; presumably controls whether the target itself is consumed.</param>
/// <returns>A text node, or null when <c>TryConsume</c> fails or produces an empty string.</returns>
static IHtmlNode ParseTextElement(IHtmlReader html, string upto, bool consumeTarget)
{
    string rawText;
    var consumed = html.TryConsume(upto, true, consumeTarget, out rawText);
    if (!consumed)
    {
        return null;
    }

    if (rawText == string.Empty)
    {
        return null;
    }

    var textNode = new HtmlTextElement();
    textNode.Value = CollapseWhiteSpace(rawText);
    return textNode;
}
/// <summary>
/// Creates an HtmlContentFragment object.
/// </summary>
/// <param name="reader">The HTML reader/analyzer.</param>
/// <param name="startIndex">The start position.</param>
/// <param name="length">The content length.</param>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="startIndex"/> is negative, or <paramref name="length"/> is zero or negative.
/// </exception>
public HtmlContentFragment(IHtmlReader reader, int startIndex, int length)
{
    // Arguments are checked in declaration order; nothing is assigned until all pass.
    if (object.ReferenceEquals(reader, null))
    {
        throw new ArgumentNullException("reader");
    }
    else if (startIndex < 0)
    {
        throw new ArgumentOutOfRangeException("startIndex");
    }
    else if (length < 1)
    {
        throw new ArgumentOutOfRangeException("length");
    }

    Reader = reader;
    StartIndex = startIndex;
    Length = length;
}
/// <summary>
/// Creates a <c>HenanHrPageParser</c>. Both arguments are passed unchanged to the
/// base-class constructor; no additional initialization is performed here.
/// </summary>
/// <param name="siteParameter">Site configuration forwarded to the base class.</param>
/// <param name="htmlReader">HTML reader forwarded to the base class.</param>
public HenanHrPageParser(SiteParameter siteParameter, IHtmlReader htmlReader)
    : base(siteParameter, htmlReader)
{
}
/// <summary>
/// Reads an attribute value, which must be introduced by '='.
/// </summary>
/// <remarks>
/// <para>
/// Unquoted values end at whitespace or at '>'. The whitespace is consumed, but the '>'
/// is left in the reader so the caller can recognise the end of the tag; consuming it here
/// would make the caller's attribute loop run on into the element's content.
/// </para>
/// <para>
/// Quoted values (single or double quotes) end at the matching quote, which is consumed.
/// A backslash escapes the next character: the backslash itself is dropped and the
/// following character is taken literally. The escape applies to exactly one character.
/// </para>
/// </remarks>
/// <param name="html">Reader positioned directly after an attribute name.</param>
/// <returns>
/// The value text (possibly empty), or null when the reader is not positioned at '='.
/// </returns>
static string ParseAttributeValue(IHtmlReader html)
{
    if (html.Peek() != '=')
    {
        return null;
    }
    html.Consume();

    var word = new List<char>();
    var quote = (char)0;     // 0 while no opening quote has been seen
    var escaped = false;     // true directly after an unescaped backslash

    while (html.Any())
    {
        var c = html.Peek();

        if (quote == 0 && c == '>')
        {
            // End of tag: terminates an unquoted (possibly empty) value and is left
            // for the caller to consume.
            break;
        }

        if (quote == 0 && char.IsWhiteSpace(c))
        {
            html.Consume();
            if (word.Count > 0)
            {
                break;
            }

            // Whitespace between '=' and the value is skipped.
            continue;
        }

        if ((c == '\'' || c == '"') && !escaped)
        {
            if (quote == c)
            {
                // Matching closing quote.
                html.Consume();
                break;
            }

            if (quote == 0)
            {
                // Opening quote.
                quote = c;
                html.Consume();
                continue;
            }

            // The other kind of quote inside a quoted value is ordinary text.
        }
        else if (c == '\\' && !escaped)
        {
            escaped = true;
            html.Consume();
            continue;
        }

        // An escape covers only the character appended here.
        escaped = false;
        html.Consume();
        word.Add(c);
    }

    return string.Join("", word);
}
/// <summary>
/// Attempts to consume the closing tag "&lt;/name&gt;" for the given element name.
/// The result of the attempt is ignored.
/// </summary>
/// <param name="name">Element name whose closing tag is expected.</param>
/// <param name="html">Reader to consume from.</param>
static void TryConsumeElementCloser(string name, IHtmlReader html)
{
    var closingTag = string.Concat("</", name, ">");
    html.TryConsume(closingTag, false, true);
}
/// <summary>
/// Creates a <c>JsonPageParser</c> and stores its two collaborators.
/// </summary>
/// <param name="siteParameter">Site configuration; must not be null.</param>
/// <param name="htmlReader">HTML reader; must not be null.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public JsonPageParser(SiteParameter siteParameter, IHtmlReader htmlReader)
{
    if (siteParameter is null)
    {
        throw new ArgumentNullException(nameof(siteParameter));
    }
    this.SiteParameter = siteParameter;

    if (htmlReader is null)
    {
        throw new ArgumentNullException(nameof(htmlReader));
    }
    this.HtmlReader = htmlReader;
}