/// <summary>
/// Recursively builds the Rawler element tree that corresponds to an HTML node
/// and, unless the node is treated as a leaf, to its child nodes.
/// </summary>
/// <param name="node">HTML node to convert.</param>
/// <returns>
/// The element created for <paramref name="node"/> with the elements of its
/// children attached. When the node itself produces nothing, the first non-null
/// element produced by a child becomes the result and later child elements are
/// attached to it. <c>null</c> when nothing is produced at all.
/// </returns>
private RawlerBase CreateRawler(HtmlNode node)
{
    RawlerBase result = null;

    // Cleared when the node is handled as a leaf so that its children are not visited.
    bool examineChildren = true;

    // True when the node's only child is a text node.
    bool onlyChildIsText = node.ChildNodes.Count == 1
        && node.ChildNodes.Any(n => n.Name == "#text");

    if (RawlerExpressLib.Automation.Extend.HTMLExtend.TargetAnalyzeTag.Contains(node.Name))
    {
        var container = new Tags() { Tag = node.Name };

        var classAttribute = node.Attributes.FirstOrDefault(n => n.Name == "class");
        if (classAttribute != null)
        {
            container.ClassName = classAttribute.Value;
        }

        var idAttribute = node.Attributes.FirstOrDefault(n => n.Name == "id");
        if (idAttribute != null)
        {
            container.IdName = idAttribute.Value;
        }

        if (onlyChildIsText)
        {
            container.AddChildren(new DataWrite() { Attribute = GetCategoryName() + container.ClassName });
            examineChildren = false;
        }

        if (node.Attributes.Any(n => n.Name == "style" && n.Value.Contains("background")))
        {
            container.TagVisbleType = TagVisbleType.Outer;

            // The chain is invoked for its effect on "container"; the value it
            // returns is not used because the result is always "container" itself.
            // NOTE(review): this is the only place that builds the prefix with
            // CategoryName + "." instead of GetCategoryName() - confirm that is intended.
            container.Add(new ImageLinks() { ImageType = ImageType.BackgroundImage })
                .DataWrite(CategoryName + "." + node.GetClassName() + "_Image", DataAttributeType.Image)
                .GetRoot();
        }

        result = container;
    }
    else if (node.Name == "a")
    {
        var href = node.OuterHtml.ToHtml(baseUrl).GetLink()
            .FirstDefault<RawlerLib.Web.Link, string>(n => n.Url, null);

        if (href == null)
        {
            // Anchor tag without a URL: handle it like a plain tag.
            var anchorTag = new Tags() { Tag = node.Name };

            var classAttribute = node.Attributes.FirstOrDefault(n => n.Name == "class");
            if (classAttribute != null)
            {
                anchorTag.ClassName = classAttribute.Value;
            }

            var idAttribute = node.Attributes.FirstOrDefault(n => n.Name == "id");
            if (idAttribute != null)
            {
                anchorTag.IdName = idAttribute.Value;
            }

            result = anchorTag;
        }
        else
        {
            // One tag-level Links element holding a URL writer and a label writer.
            var linkTag = new Links() { VisbleType = LinkVisbleType.Tag };
            var urlWriter = new Links() { VisbleType = LinkVisbleType.Url }
                .DataWrite(GetCategoryName() + node.GetClassName() + "_Link")
                .GetRoot();
            var labelWriter = new Links() { VisbleType = LinkVisbleType.Label }
                .DataWrite(GetCategoryName() + node.GetClassName() + "_Label")
                .GetRoot();

            result = linkTag.AddRange(urlWriter, labelWriter);
        }

        if (onlyChildIsText)
        {
            examineChildren = false;
        }
    }
    else if (node.Name == "img")
    {
        var imageUrl = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
        if (imageUrl != null)
        {
            result = new ImageLinks()
                .DataWrite(GetCategoryName() + node.GetClassName() + "_Image", DataAttributeType.Image)
                .GetRoot();
        }
    }
    else if (node.Attributes.Any(n => n.Name == "style" && n.Value.Contains("background")))
    {
        // React to background images declared through the style attribute.
        result = new ImageLinks() { ImageType = ImageType.BackgroundImage }
            .DataWrite(GetCategoryName() + node.GetClassName() + "_Image", DataAttributeType.Image)
            .GetRoot();
    }
    else if (node.Name == "span")
    {
        var spanTag = new Tags() { Tag = node.Name };

        var classAttribute = node.Attributes.FirstOrDefault(n => n.Name == "class");
        if (classAttribute != null)
        {
            spanTag.ClassName = classAttribute.Value;
        }

        if (onlyChildIsText)
        {
            spanTag.AddChildren(new DataWrite() { Attribute = GetCategoryName() + spanTag.ClassName });
            examineChildren = false;
        }

        result = spanTag;
    }
    else if (node.OuterHtml.Replace("\n", "").Trim().Length > 0)
    {
        // Any other node with non-blank markup: strip the tags and write the text.
        result = new TagClear().Trim()
            .Add(new DataWrite() { Attribute = GetCategoryName() + node.GetClassName() + "_" + node.Name })
            .GetRoot();

        if (onlyChildIsText)
        {
            examineChildren = false;
        }
    }

    // A node whose single child is a span additionally writes its own value.
    if (result != null && node.ChildNodes.Count == 1 && node.ChildNodes.Any(n => n.Name == "span"))
    {
        result.AddChildren(new DataWrite() { Attribute = GetCategoryName() + node.GetClassName() });
    }

    if (!examineChildren)
    {
        return result;
    }

    foreach (var child in node.ChildNodes)
    {
        var childResult = CreateRawler(child);
        if (childResult == null)
        {
            continue;
        }

        if (result == null)
        {
            // Nothing was created for this node: promote the child's element.
            result = childResult;
        }
        else
        {
            result.AddChildren(childResult);
        }
    }

    return result;
}
/// <summary>
/// Recursively builds the Rawler element tree that corresponds to an HTML node
/// and, unless the node is treated as a leaf, to its child nodes.
/// </summary>
/// <remarks>
/// Anchors and images are classified with <c>urlTfidf</c> / <c>imageTfidf</c>:
/// a URL whose IDF entry is 0 or less is skipped together with the node's
/// children; a URL contained in the result of <c>GetTakeTopValue</c> is written
/// as a "Main" link / "Image"; any other URL is written as a "Sub" link / "Icon".
/// </remarks>
/// <param name="node">HTML node to convert.</param>
/// <returns>
/// The element created for <paramref name="node"/> with the elements of its
/// children attached. When the node itself produces nothing, the first non-null
/// element produced by a child becomes the result and later child elements are
/// attached to it. <c>null</c> when nothing is produced at all.
/// </returns>
public RawlerBase CreateRawler(HtmlNode node)
{
    var baseUrl = BaseUrl;
    RawlerBase result = null;

    // Cleared when the node is handled as a leaf (or skipped) so that its
    // children are not visited.
    bool examineChildren = true;

    // True when the node's only child is a text node.
    bool onlyChildIsText = node.ChildNodes.Count == 1
        && node.ChildNodes.Any(n => n.Name == "#text");

    if (targetTag.Contains(node.Name))
    {
        var container = new Tags() { Tag = node.Name };

        var classAttribute = node.Attributes.FirstOrDefault(n => n.Name == "class");
        if (classAttribute != null)
        {
            container.ClassName = classAttribute.Value;
        }

        var idAttribute = node.Attributes.FirstOrDefault(n => n.Name == "id");
        if (idAttribute != null)
        {
            container.IdName = idAttribute.Value;
        }

        if (onlyChildIsText)
        {
            container.AddChildren(new DataWrite() { Attribute = container.ClassName });
            examineChildren = false;
        }

        if (node.Attributes.Any(n => n.Name == "style" && n.Value.Contains("background")))
        {
            container.TagVisbleType = TagVisbleType.Outer;

            // The chain is invoked for its effect on "container"; the value it
            // returns is not used because the result is always "container" itself.
            container.Add(new ImageLinks() { ImageType = ImageType.BackgroundImage })
                .DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image)
                .GetRoot();
        }

        result = container;
    }
    else if (node.Name == "a")
    {
        // Extract the links once and reuse them for both the TF-IDF scoring and the URL.
        var links = node.OuterHtml.ToHtml(baseUrl).GetLink();
        var urlScores = urlTfidf.GetResult(links.Select(n => n.Url));
        var url = links.FirstDefault<RawlerLib.Web.Link, string>(n => n.Url, null);

        if (url == null)
        {
            // Anchor tag without a URL: handle it like a plain tag.
            var anchorTag = new Tags() { Tag = node.Name };

            var classAttribute = node.Attributes.FirstOrDefault(n => n.Name == "class");
            if (classAttribute != null)
            {
                anchorTag.ClassName = classAttribute.Value;
            }

            var idAttribute = node.Attributes.FirstOrDefault(n => n.Name == "id");
            if (idAttribute != null)
            {
                anchorTag.IdName = idAttribute.Value;
            }

            result = anchorTag;
        }
        else
        {
            var urlIdf = urlTfidf.IDFDic.GetValueOrDefault(url);
            if (urlIdf != null && urlIdf.Value <= 0)
            {
                // An IDF of 0 or less means the URL exists in every document: skip it.
                examineChildren = false;
            }
            else if (urlScores.GetTakeTopValue(n => n.TFIDF).Any(n => n.Word == url))
            {
                result = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                    new Links() { VisbleType = LinkVisbleType.Url }
                        .DataWrite(node.GetClassName() + "_MainLink", DataAttributeType.Url)
                        .GetRoot());

                // The label is only written when the anchor contains nothing but text.
                if (onlyChildIsText)
                {
                    result.Add(
                        new Links() { VisbleType = LinkVisbleType.Label }
                            .DataWrite(node.GetClassName() + "_MainLabel")
                            .GetRoot());
                }
            }
            else
            {
                result = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                    new Links() { VisbleType = LinkVisbleType.Url, Enable = enableGetSubUrlLink }
                        .DataWrite(node.GetClassName() + "_SubLink")
                        .GetRoot(),
                    new Links() { VisbleType = LinkVisbleType.Label }
                        .DataWrite(node.GetClassName() + "_SubLabel")
                        .GetRoot());
            }
        }

        if (onlyChildIsText)
        {
            examineChildren = false;
        }
    }
    else if (node.Name == "img")
    {
        // Extract the image links once and reuse them for both the TF-IDF scoring and the URL.
        var imageLinks = node.OuterHtml.ToHtml(baseUrl).GetImageLink();
        var imageScores = imageTfidf.GetResult(imageLinks.Select(n => n.Url));
        var url = imageLinks.FirstDefault(n => n.Url, null);

        if (url != null)
        {
            // Null-check the lookup exactly like the anchor branch does. Reading
            // .Value directly fails for a URL that has no IDF entry; a missing
            // entry (including an empty dictionary) now simply means "not common".
            var imageIdf = imageTfidf.IDFDic.GetValueOrDefault(url);
            if (imageIdf != null && imageIdf.Value <= 0)
            {
                // An IDF of 0 or less means the image exists in every document: skip it.
                examineChildren = false;
            }
            else if (imageScores.GetTakeTopValue(n => n.TFIDF).Any(n => n.Word == url))
            {
                result = new ImageLinks()
                    .DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image)
                    .GetRoot();
            }
            else if (iconImageColumn)
            {
                // Icon images become a column: the image label is the attribute, the value is "1".
                result = new DataWrite()
                {
                    AttributeTree = new ImageLinks() { VisbleType = LinkVisbleType.Label },
                    Value = "1"
                };
            }
            else
            {
                result = new ImageLinks()
                    .DataWrite(node.GetClassName() + "_Icon", DataAttributeType.Image)
                    .GetRoot();
            }
        }
    }
    else if (node.Attributes.Any(n => n.Name == "style" && n.Value.Contains("background")))
    {
        // React to background images declared through the style attribute.
        result = new ImageLinks() { ImageType = ImageType.BackgroundImage }
            .DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image)
            .GetRoot();
    }
    else if (node.Name == "span")
    {
        var spanTag = new Tags() { Tag = node.Name };

        var classAttribute = node.Attributes.FirstOrDefault(n => n.Name == "class");
        if (classAttribute != null)
        {
            spanTag.ClassName = classAttribute.Value;
        }

        if (onlyChildIsText)
        {
            spanTag.AddChildren(new DataWrite() { Attribute = spanTag.ClassName });
            examineChildren = false;
        }

        result = spanTag;
    }
    else if (node.Name == "#comment")
    {
        // HTML comments produce nothing and are not descended into.
        examineChildren = false;
    }
    else if (node.OuterHtml.Replace("\n", "").Trim().Length > 0)
    {
        // Any other node with non-blank markup: strip the tags and write the text.
        result = new TagClear().Trim()
            .Add(new DataWrite() { Attribute = node.GetClassName() + "_" + node.Name })
            .GetRoot();

        if (onlyChildIsText)
        {
            examineChildren = false;
        }
    }

    // A node whose single child is a span additionally writes its own value.
    if (result != null && node.ChildNodes.Count == 1 && node.ChildNodes.Any(n => n.Name == "span"))
    {
        result.AddChildren(new DataWrite() { Attribute = node.GetClassName() });
    }

    if (!examineChildren)
    {
        return result;
    }

    foreach (var child in node.ChildNodes)
    {
        var childResult = CreateRawler(child);
        if (childResult == null)
        {
            continue;
        }

        if (result == null)
        {
            // Nothing was created for this node: promote the child's element.
            result = childResult;
        }
        else
        {
            result.AddChildren(childResult);
        }
    }

    return result;
}