Exemple #1
0
        public RawlerBase CreateRawler(HtmlNode node)
        {
            var        baseUrl = BaseUrl;
            RawlerBase rawler  = null;

            bool flag次のノードを調べる = true;

            if (targetTag.Contains(node.Name))
            {
                Tags tags = new Tags()
                {
                    Tag = node.Name
                };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.Attributes.Where(n => n.Name == "id").Any())
                {
                    tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite()
                    {
                        Attribute = tags.ClassName
                    });
                    flag次のノードを調べる = false;
                }
                if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
                {
                    tags.TagVisbleType = TagVisbleType.Outer;
                    rawler             = tags.Add(new ImageLinks()
                    {
                        ImageType = ImageType.BackgroundImage
                    }).DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }

                rawler = tags;
            }
            else if (node.Name == "a")
            {
                var resultUrlTFIDF = urlTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetLink().Select(n => n.Url));
                var url            = node.OuterHtml.ToHtml(baseUrl).GetLink().FirstDefault <RawlerLib.Web.Link, string>(n => n.Url, null);
                if (url != null)
                {
                    //IDF が0以下の時、すべてのドキュメントで存在する。
                    if (urlTfidf.IDFDic.GetValueOrDefault(url) != null && urlTfidf.IDFDic.GetValueOrDefault(url).Value <= 0)
                    {
                        rawler        = null;
                        flag次のノードを調べる = false;
                    }
                    else
                    {
                        if (resultUrlTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                        {
                            rawler = new Links()
                            {
                                VisbleType = LinkVisbleType.Tag
                            }.AddRange(
                                new Links()
                            {
                                VisbleType = LinkVisbleType.Url
                            }.DataWrite(node.GetClassName() + "_MainLink", DataAttributeType.Url).GetRoot());
                            if (node.ChildNodes.Count == 1 && node.ChildNodes.First().Name == "#text")
                            {
                                rawler.Add(new Links()
                                {
                                    VisbleType = LinkVisbleType.Label
                                }.DataWrite(node.GetClassName() + "_MainLabel").GetRoot());
                            }
                        }
                        else
                        {
                            rawler = new Links()
                            {
                                VisbleType = LinkVisbleType.Tag
                            }.AddRange(
                                new Links()
                            {
                                VisbleType = LinkVisbleType.Url, Enable = enableGetSubUrlLink
                            }.DataWrite(node.GetClassName() + "_SubLink").GetRoot(),
                                new Links()
                            {
                                VisbleType = LinkVisbleType.Label
                            }.DataWrite(node.GetClassName() + "_SubLabel").GetRoot()
                                );
                        }
                    }
                }
                else
                {
                    //URLがないAタグの場合。
                    Tags tags = new Tags()
                    {
                        Tag = node.Name
                    };
                    if (node.Attributes.Where(n => n.Name == "class").Any())
                    {
                        tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                    }
                    if (node.Attributes.Where(n => n.Name == "id").Any())
                    {
                        tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                    }
                    rawler = tags;
                }
                if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    flag次のノードを調べる = false;
                }
            }
            else if (node.Name == "img")
            {
                var resultImgeTFIDF = imageTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetImageLink().Select(n => n.Url));

                var url = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
                if (url != null)
                {
                    if (imageTfidf.IDFDic.Count > 0 && imageTfidf.IDFDic.GetValueOrDefault(url).Value <= 0)
                    {
                        rawler        = null;
                        flag次のノードを調べる = false;
                    }
                    else
                    {
                        if (resultImgeTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                        {
                            rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                        }
                        else
                        {
                            if (iconImageColumn)
                            {
                                rawler = new DataWrite()
                                {
                                    AttributeTree = new ImageLinks()
                                    {
                                        VisbleType = LinkVisbleType.Label
                                    }, Value = "1"
                                };
                            }
                            else
                            {
                                rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Icon", DataAttributeType.Image).GetRoot();
                            }
                        }
                    }
                }
            }
            ///背景画像に反応させる。
            else if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
            {
                rawler = new ImageLinks()
                {
                    ImageType = ImageType.BackgroundImage
                }.DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
            }
            else if (node.Name == "span")
            {
                Tags tags = new Tags()
                {
                    Tag = node.Name
                };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite()
                    {
                        Attribute = tags.ClassName
                    });
                    flag次のノードを調べる = false;
                }

                rawler = tags;
            }
            else if (node.Name == "#comment")
            {
                flag次のノードを調べる = false;
            }
            else
            {
                var t = node.OuterHtml.Replace("\n", "").Trim();
                if (t.Length > 0)
                {
                    rawler = new TagClear().Trim().Add(new DataWrite()
                    {
                        Attribute = node.GetClassName() + "_" + node.Name
                    }).GetRoot();
                    if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                    {
                        flag次のノードを調べる = false;
                    }
                }
            }
            if (rawler != null && node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "span").Any())
            {
                rawler.AddChildren(new DataWrite()
                {
                    Attribute = node.GetClassName()
                });
            }

            foreach (var item in node.ChildNodes)
            {
                if (flag次のノードを調べる)
                {
                    var r = CreateRawler(item);

                    if (r != null && rawler != null)
                    {
                        rawler.AddChildren(r);
                    }
                    else
                    {
                        if (r != null && rawler == null)
                        {
                            rawler = r;
                        }
                    }
                }
            }
            return(rawler);
        }
        private RawlerBase CreateRawler(HtmlNode node)
        {
            RawlerBase rawler = null;

            bool flag次のノードを調べる = true;

            if (RawlerExpressLib.Automation.Extend.HTMLExtend.TargetAnalyzeTag.Contains(node.Name))
            {
                Tags tags = new Tags()
                {
                    Tag = node.Name
                };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.Attributes.Where(n => n.Name == "id").Any())
                {
                    tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite()
                    {
                        Attribute = GetCategoryName() + tags.ClassName
                    });
                    flag次のノードを調べる = false;
                }
                if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
                {
                    tags.TagVisbleType = TagVisbleType.Outer;
                    rawler             = tags.Add(new ImageLinks()
                    {
                        ImageType = ImageType.BackgroundImage
                    }).DataWrite(CategoryName + "." + node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }

                rawler = tags;
            }
            else if (node.Name == "a")
            {
                var url = node.OuterHtml.ToHtml(baseUrl).GetLink().FirstDefault <RawlerLib.Web.Link, string>(n => n.Url, null);
                if (url != null)
                {
                    {
                        rawler = new Links()
                        {
                            VisbleType = LinkVisbleType.Tag
                        }.AddRange(
                            new Links()
                        {
                            VisbleType = LinkVisbleType.Url
                        }.DataWrite(GetCategoryName() + node.GetClassName() + "_Link").GetRoot(),
                            new Links()
                        {
                            VisbleType = LinkVisbleType.Label
                        }.DataWrite(GetCategoryName() + node.GetClassName() + "_Label").GetRoot()
                            );
                    }
                }
                else
                {
                    //URLがないAタグの場合。
                    Tags tags = new Tags()
                    {
                        Tag = node.Name
                    };
                    if (node.Attributes.Where(n => n.Name == "class").Any())
                    {
                        tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                    }
                    if (node.Attributes.Where(n => n.Name == "id").Any())
                    {
                        tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                    }
                    rawler = tags;
                }
                if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    flag次のノードを調べる = false;
                }
            }
            else if (node.Name == "img")
            {
                var url = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
                if (url != null)
                {
                    rawler = new ImageLinks().DataWrite(GetCategoryName() + node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }
            }
            ///背景画像に反応させる。
            else if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
            {
                rawler = new ImageLinks()
                {
                    ImageType = ImageType.BackgroundImage
                }.DataWrite(GetCategoryName() + node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
            }
            else if (node.Name == "span")
            {
                Tags tags = new Tags()
                {
                    Tag = node.Name
                };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite()
                    {
                        Attribute = GetCategoryName() + tags.ClassName
                    });
                    flag次のノードを調べる = false;
                }

                rawler = tags;
            }
            else
            {
                var t = node.OuterHtml.Replace("\n", "").Trim();
                if (t.Length > 0)
                {
                    rawler = new TagClear().Trim().Add(new DataWrite()
                    {
                        Attribute = GetCategoryName() + node.GetClassName() + "_" + node.Name
                    }).GetRoot();
                    if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                    {
                        flag次のノードを調べる = false;
                    }
                }
            }
            if (rawler != null && node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "span").Any())
            {
                rawler.AddChildren(new DataWrite()
                {
                    Attribute = GetCategoryName() + node.GetClassName()
                });
            }

            foreach (var item in node.ChildNodes)
            {
                if (flag次のノードを調べる)
                {
                    var r = CreateRawler(item);

                    if (r != null && rawler != null)
                    {
                        rawler.AddChildren(r);
                    }
                    else
                    {
                        if (r != null && rawler == null)
                        {
                            rawler = r;
                        }
                    }
                }
            }
            return(rawler);
        }