private RawlerBase CreateRawler(HtmlNode node)
        {
            RawlerBase rawler = null;

            bool flag次のノードを調べる = true;
            if (RawlerExpressLib.Automation.Extend.HTMLExtend.TargetAnalyzeTag.Contains(node.Name))
            {
                Tags tags = new Tags() { Tag = node.Name };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.Attributes.Where(n => n.Name == "id").Any())
                {
                    tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite() { Attribute = GetCategoryName() + tags.ClassName });
                    flag次のノードを調べる = false;
                }
                if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
                {
                    tags.TagVisbleType = TagVisbleType.Outer;
                    rawler = tags.Add(new ImageLinks() { ImageType = ImageType.BackgroundImage }).DataWrite(CategoryName + "." + node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }

                rawler = tags;
            }
            else if (node.Name == "a")
            {
                var url = node.OuterHtml.ToHtml(baseUrl).GetLink().FirstDefault<RawlerLib.Web.Link, string>(n => n.Url, null);
                if (url != null)
                {
                    {
                        rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                            new Links() { VisbleType = LinkVisbleType.Url }.DataWrite(GetCategoryName() + node.GetClassName() + "_Link").GetRoot(),
                            new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(GetCategoryName() + node.GetClassName() + "_Label").GetRoot()
                        );

                    }
                }
                else
                {
                    //URLがないAタグの場合。
                    Tags tags = new Tags() { Tag = node.Name };
                    if (node.Attributes.Where(n => n.Name == "class").Any())
                    {
                        tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                    }
                    if (node.Attributes.Where(n => n.Name == "id").Any())
                    {
                        tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                    }
                    rawler = tags;
                }
                if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    flag次のノードを調べる = false;
                }

            }
            else if (node.Name == "img")
            {

                var url = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
                if (url != null)
                {
                    rawler = new ImageLinks().DataWrite(GetCategoryName() + node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }

            }
            ///背景画像に反応させる。
            else if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
            {
                rawler = new ImageLinks() { ImageType = ImageType.BackgroundImage }.DataWrite(GetCategoryName() + node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
            }
            else if (node.Name == "span")
            {
                Tags tags = new Tags() { Tag = node.Name };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite() { Attribute = GetCategoryName() + tags.ClassName });
                    flag次のノードを調べる = false;
                }

                rawler = tags;
            }
            else
            {
                var t = node.OuterHtml.Replace("\n", "").Trim();
                if (t.Length > 0)
                {
                    rawler = new TagClear().Trim().Add(new DataWrite() { Attribute = GetCategoryName() + node.GetClassName() + "_" + node.Name }).GetRoot();
                    if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                    {
                        flag次のノードを調べる = false;
                    }
                }
            }
            if (rawler != null && node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "span").Any())
            {
                rawler.AddChildren(new DataWrite() { Attribute = GetCategoryName() + node.GetClassName() });
            }

            foreach (var item in node.ChildNodes)
            {
                if (flag次のノードを調べる)
                {
                    var r = CreateRawler(item);

                    if (r != null && rawler != null)
                    {
                        rawler.AddChildren(r);
                    }
                    else
                    {
                        if (r != null && rawler == null)
                        {
                            rawler = r;
                        }
                    }
                }

            }
            return rawler;
        }
Ejemplo n.º 2
0
        public RawlerBase CreateRawler(HtmlNode node)
        {
            var baseUrl = BaseUrl;
            RawlerBase rawler = null;

            bool flag次のノードを調べる = true;
            if (targetTag.Contains(node.Name))
            {
                Tags tags = new Tags() { Tag = node.Name };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.Attributes.Where(n => n.Name == "id").Any())
                {
                    tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite() { Attribute = tags.ClassName });
                    flag次のノードを調べる = false;
                }
                if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
                {
                    tags.TagVisbleType = TagVisbleType.Outer;
                    rawler = tags.Add(new ImageLinks() { ImageType = ImageType.BackgroundImage }).DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }

                rawler = tags;
            }
            else if (node.Name == "a")
            {
                var resultUrlTFIDF = urlTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetLink().Select(n => n.Url));
                var url = node.OuterHtml.ToHtml(baseUrl).GetLink().FirstDefault<RawlerLib.Web.Link, string>(n => n.Url, null);
                if (url != null)
                {
                    //IDF が0以下の時、すべてのドキュメントで存在する。
                    if (urlTfidf.IDFDic.GetValueOrDefault(url) !=null && urlTfidf.IDFDic.GetValueOrDefault(url).Value <= 0)
                    {
                        rawler = null;
                        flag次のノードを調べる = false;
                    }
                    else
                    {
                        if (resultUrlTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                        {
                            rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                                new Links() { VisbleType = LinkVisbleType.Url }.DataWrite(node.GetClassName() + "_MainLink", DataAttributeType.Url).GetRoot());
                            if (node.ChildNodes.Count == 1 && node.ChildNodes.First().Name == "#text")
                            {
                                rawler.Add(new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(node.GetClassName() + "_MainLabel").GetRoot());
                            }
                        }
                        else
                        {
                            rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                                new Links() { VisbleType = LinkVisbleType.Url, Enable = enableGetSubUrlLink }.DataWrite(node.GetClassName() + "_SubLink").GetRoot(),
                                new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(node.GetClassName() + "_SubLabel").GetRoot()
                            );

                        }
                    }
                }
                else
                {
                    //URLがないAタグの場合。
                    Tags tags = new Tags() { Tag = node.Name };
                    if (node.Attributes.Where(n => n.Name == "class").Any())
                    {
                        tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                    }
                    if (node.Attributes.Where(n => n.Name == "id").Any())
                    {
                        tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                    }
                    rawler = tags;
                }
                if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    flag次のノードを調べる = false;
                }

            }
            else if (node.Name == "img")
            {
                var resultImgeTFIDF = imageTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetImageLink().Select(n => n.Url));

                var url = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
                if (url != null)
                {
                    if (imageTfidf.IDFDic.Count>0 && imageTfidf.IDFDic.GetValueOrDefault(url).Value <= 0)
                    {
                        rawler = null;
                        flag次のノードを調べる = false;
                    }
                    else
                    {
                        if (resultImgeTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                        {
                            rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                        }
                        else
                        {
                            if (iconImageColumn)
                            {
                                rawler = new DataWrite() { AttributeTree = new ImageLinks() { VisbleType = LinkVisbleType.Label }, Value = "1" };
                            }
                            else
                            {
                                rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Icon", DataAttributeType.Image).GetRoot();
                            }
                        }
                    }
                }
            }
            ///背景画像に反応させる。
            else if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
            {
                rawler = new ImageLinks() { ImageType = ImageType.BackgroundImage }.DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
            }
            else if (node.Name == "span")
            {
                Tags tags = new Tags() { Tag = node.Name };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite() { Attribute = tags.ClassName });
                    flag次のノードを調べる = false;
                }

                rawler = tags;
            }
            else if (node.Name == "#comment")
            {
                flag次のノードを調べる = false;
            }
            else
            {
                var t = node.OuterHtml.Replace("\n", "").Trim();
                if (t.Length > 0)
                {
                    rawler = new TagClear().Trim().Add(new DataWrite() { Attribute = node.GetClassName() + "_" + node.Name }).GetRoot();
                    if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                    {
                        flag次のノードを調べる = false;
                    }
                }
            }
            if (rawler != null && node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "span").Any())
            {
                rawler.AddChildren(new DataWrite() { Attribute = node.GetClassName() });
            }

            foreach (var item in node.ChildNodes)
            {
                if (flag次のノードを調べる)
                {
                    var r = CreateRawler(item);

                    if (r != null && rawler != null)
                    {
                        rawler.AddChildren(r);
                    }
                    else
                    {
                        if (r != null && rawler == null)
                        {
                            rawler = r;
                        }
                    }
                }

            }
            return rawler;
        }