Exemple #1
0
        /// <summary>
        /// Creates a copy of this node together with copies of all of its children.
        /// </summary>
        /// <param name="parent">Node the copy is attached to through <c>SetParent</c>.</param>
        /// <returns>The newly created <c>DataWrite</c> copy.</returns>
        public override RawlerBase Clone(RawlerBase parent)
        {
            var copy = new DataWrite();
            RawlerLib.ObjectLib.FildCopy(this, copy);
            copy.SetParent(parent);
            this.CloneEvent(copy);

            // Empty the copy's child list, then refill it with clones of this node's children.
            copy.children.Clear();
            foreach (var original in this.Children)
            {
                copy.AddChildren(original.Clone(copy));
            }

            return copy;
        }
Exemple #2
0
        /// <summary>
        /// Parses a JSON object and runs a <c>DataWrite</c> for each of its properties.
        /// </summary>
        /// <remarks>
        /// The JSON comes from <c>Json</c> when that is non-empty, otherwise from <c>GetText()</c>.
        /// A property holding a nested object is additionally handed to a nested
        /// <c>DataWriteAllJsonData</c> whose <c>PropertyName</c> is the current prefix,
        /// the property name and a trailing ".".
        /// </remarks>
        /// <param name="runChildren">Forwarded unchanged to <c>base.Run</c>.</param>
        public override void Run(bool runChildren)
        {
            string jsonText = GetText();
            if (!string.IsNullOrEmpty(Json))
            {
                jsonText = Json;
            }

            var jsonObject = JObject.Parse(jsonText);
            foreach (var property in jsonObject.Properties())
            {
                var token = property.Value;
                if (token == null || token.Type == Newtonsoft.Json.Linq.JTokenType.Null)
                {
                    // A missing value and an explicit JSON null are both recorded as the text "Null".
                    SetText("Null");
                }
                else if (token.Type == Newtonsoft.Json.Linq.JTokenType.Object)
                {
                    // Nested object: delegate to a nested instance; this node's text is not set in this branch.
                    var nested = new DataWriteAllJsonData();
                    nested.SetParent(this);
                    nested.Json = token.ToString();
                    nested.PropertyName = this.PropertyName.NullIsEmpty() + property.Name + ".";
                    nested.Run();
                }
                else
                {
                    SetText(token.ToString());
                }

                // One DataWrite per property, keyed by prefix + property name.
                var writer = new DataWrite();
                writer.SetParent(this);
                writer.Attribute = PropertyName.NullIsEmpty() + property.Name;
                writer.Run();
            }

            base.Run(runChildren);
        }
Exemple #3
0
 /// <summary>
 /// Produces a duplicate of this node, including duplicates of every child.
 /// </summary>
 /// <param name="parent">Node passed to <c>SetParent</c> on the duplicate.</param>
 /// <returns>The duplicated <c>DataWrite</c> instance.</returns>
 public override RawlerBase Clone(RawlerBase parent)
 {
     var duplicate = new DataWrite();
     RawlerLib.ObjectLib.FildCopy(this, duplicate);
     duplicate.SetParent(parent);
     this.CloneEvent(duplicate);

     // Clear the duplicate's child list before re-adding a clone of each child of this node.
     duplicate.children.Clear();
     foreach (var source in this.Children)
     {
         duplicate.AddChildren(source.Clone(duplicate));
     }

     return duplicate;
 }
        /// <summary>
        /// Builds the scraping tree for an IMDb name search and stores it in <c>root.Rawler</c>.
        /// </summary>
        /// <remarks>
        /// Tree built here: list page → "results" table → each "detailed" tr → "/name/" link →
        /// linked page, where "Name", "role", "born-year", "born-date" and "born-place" are written,
        /// followed by a <c>NextDataRow</c>. A link labelled "Next" on the list page gets a
        /// <c>NextPage</c> child.
        /// </remarks>
        private void CreateRawler()
        {
            Tool.Data data = new Tool.Data();
            root.Rawler = data;

            // List page.
            Tool.Page page = new Tool.Page();
            data.AddChildren(page);
            page.Url = "http://www.imdb.com/search/name?star_sign=aquarius&sort=starmeter,asc";
            //   page.Url = "http://www.imdb.com/search/name?sort=starmeter,asc&star_sign=aquarius&start=10251";
            page.Comment = "一覧ページ読み込み";

            // Results table and its rows.
            Tool.TagExtraction tag1 = new Tool.TagExtraction();
            tag1.Tag             = "table";
            tag1.ParameterFilter = "class=\"results\"";
            page.AddChildren(tag1);
            Tool.TagExtraction tag2 = new Tool.TagExtraction();
            tag2.Tag             = "tr";
            tag2.IsMulti         = true;
            tag2.ParameterFilter = "detailed";
            tag1.AddChildren(tag2);

            // Link from a row to the person's page.
            Tool.Link link = new Tool.Link();
            link.UseAbsolutetLink = true;
            link.UrlFilter        = "/name/";
            link.VisbleType       = Tool.LinkVisbleType.Url;
            tag2.AddChildren(link);

            // The page the link leads to.
            Tool.Page page1 = new Tool.Page();
            link.AddChildren(page1);

            Tool.TagExtraction tag3 = new Tool.TagExtraction();
            page1.AddChildren(tag3);
            tag3.Tag             = "table";
            tag3.ParameterFilter = "id=\"name-overview-widget-layout\"";

            // Header text → "Name".
            Tool.ClipText clipText1 = new Tool.ClipText();
            tag3.AddChildren(clipText1);
            clipText1.StartClip = "<h1 class=\"header\">";
            clipText1.EndClip   = "</h1>";
            clipText1.AddChildren(new Tool.DataWrite("Name"));
            clipText1.AddChildren(new Tool.Report("Person:", ""));

            // Link labels inside the infobar → "role".
            Tool.ClipText clip2 = new Tool.ClipText("<div class=\"infobar\">", "</div>");
            tag3.AddChildren(clip2);

            Tool.Link link2 = new Tool.Link();
            clip2.AddChildren(link2);
            link2.VisbleType = Tool.LinkVisbleType.Label;
            link2.IsMulti    = true;
            link2.AddChildren(new Tool.DataWrite("role"));

            // The txt-block containing the "Born:" heading → birth year / date / place.
            Tool.ClipText clip3 = new Tool.ClipText("<div class=\"txt-block\">", "</div>");
            clip3.IsMulti = true;
            tag3.AddChildren(clip3);
            Tool.Contains contains1 = new Tool.Contains("<h4 class=\"inline\">Born:</h4>");
            clip3.AddChildren(contains1);

            Tool.Link link3 = new Tool.Link("/search/name?birth_year", "", false);
            link3.VisbleType = Tool.LinkVisbleType.Label;
            link3.AddChildren(new Tool.DataWrite("born-year"));

            Tool.Link link4 = new Tool.Link("/date/", "", false);
            link4.VisbleType = Tool.LinkVisbleType.Label;
            link4.AddChildren(new Tool.DataWrite("born-date"));

            Tool.Link link5 = new Tool.Link("/search/name?birth_place=", "", false);
            link5.VisbleType = Tool.LinkVisbleType.Label;
            link5.AddChildren(new Tool.DataWrite("born-place"));

            contains1.AddChildren(link3);
            contains1.AddChildren(link4);
            contains1.AddChildren(link5);
            page1.AddChildren(new Tool.NextDataRow());

            // "Next" link on the list page.
            Tool.Link nLink = new Tool.Link();
            nLink.UrlFilter        = "search/name?";
            nLink.LabelFilter      = "Next";
            nLink.UseAbsolutetLink = true;
            page.AddChildren(nLink);
            Tool.NextPage nextPage = new Tool.NextPage();
            nLink.AddChildren(nextPage);
        }
        /// <summary>
        /// Builds the scraping tree for an IMDb name search and stores it in <c>root.Rawler</c>.
        /// </summary>
        /// <remarks>
        /// Tree built here: list page → "results" table → each "detailed" tr → "/name/" link →
        /// linked page, where "Name", "role", "born-year", "born-date" and "born-place" are written,
        /// followed by a <c>NextDataRow</c>. A link labelled "Next" on the list page gets a
        /// <c>NextPage</c> child.
        /// </remarks>
        private void CreateRawler()
        {
            Tool.Data data = new Tool.Data();
            root.Rawler = data;

            // List page.
            Tool.Page page = new Tool.Page();
            data.AddChildren(page);
            page.Url = "http://www.imdb.com/search/name?star_sign=aquarius&sort=starmeter,asc";
            //   page.Url = "http://www.imdb.com/search/name?sort=starmeter,asc&star_sign=aquarius&start=10251";
            page.Comment = "一覧ページ読み込み";

            // Results table and its rows.
            Tool.TagExtraction tag1 = new Tool.TagExtraction();
            tag1.Tag = "table";
            tag1.ParameterFilter = "class=\"results\"";
            page.AddChildren(tag1);
            Tool.TagExtraction tag2 = new Tool.TagExtraction();
            tag2.Tag = "tr";
            tag2.IsMulti = true;
            tag2.ParameterFilter = "detailed";
            tag1.AddChildren(tag2);

            // Link from a row to the person's page.
            Tool.Link link = new Tool.Link();
            link.UseAbsolutetLink = true;
            link.UrlFilter = "/name/";
            link.VisbleType = Tool.LinkVisbleType.Url;
            tag2.AddChildren(link);

            // The page the link leads to.
            Tool.Page page1 = new Tool.Page();
            link.AddChildren(page1);

            Tool.TagExtraction tag3 = new Tool.TagExtraction();
            page1.AddChildren(tag3);
            tag3.Tag = "table";
            tag3.ParameterFilter = "id=\"name-overview-widget-layout\"";

            // Header text → "Name".
            Tool.ClipText clipText1 = new Tool.ClipText();
            tag3.AddChildren(clipText1);
            clipText1.StartClip = "<h1 class=\"header\">";
            clipText1.EndClip = "</h1>";
            clipText1.AddChildren(new Tool.DataWrite("Name"));
            clipText1.AddChildren(new Tool.Report("Person:", ""));

            // Link labels inside the infobar → "role".
            Tool.ClipText clip2 = new Tool.ClipText("<div class=\"infobar\">", "</div>");
            tag3.AddChildren(clip2);

            Tool.Link link2 = new Tool.Link();
            clip2.AddChildren(link2);
            link2.VisbleType = Tool.LinkVisbleType.Label;
            link2.IsMulti = true;
            link2.AddChildren(new Tool.DataWrite("role"));

            // The txt-block containing the "Born:" heading → birth year / date / place.
            Tool.ClipText clip3 = new Tool.ClipText("<div class=\"txt-block\">", "</div>");
            clip3.IsMulti = true;
            tag3.AddChildren(clip3);
            Tool.Contains contains1 = new Tool.Contains("<h4 class=\"inline\">Born:</h4>");
            clip3.AddChildren(contains1);

            Tool.Link link3 = new Tool.Link("/search/name?birth_year", "", false);
            link3.VisbleType = Tool.LinkVisbleType.Label;
            link3.AddChildren(new Tool.DataWrite("born-year"));

            Tool.Link link4 = new Tool.Link("/date/", "", false);
            link4.VisbleType = Tool.LinkVisbleType.Label;
            link4.AddChildren(new Tool.DataWrite("born-date"));

            Tool.Link link5 = new Tool.Link("/search/name?birth_place=", "", false);
            link5.VisbleType = Tool.LinkVisbleType.Label;
            link5.AddChildren(new Tool.DataWrite("born-place"));

            contains1.AddChildren(link3);
            contains1.AddChildren(link4);
            contains1.AddChildren(link5);
            page1.AddChildren(new Tool.NextDataRow());

            // "Next" link on the list page.
            Tool.Link nLink = new Tool.Link();
            nLink.UrlFilter = "search/name?";
            nLink.LabelFilter = "Next";
            nLink.UseAbsolutetLink = true;
            page.AddChildren(nLink);
            Tool.NextPage nextPage = new Tool.NextPage();
            nLink.AddChildren(nextPage);
        }
        /// <summary>
        /// Recursively builds a Rawler tree for the given HTML node and its child nodes.
        /// </summary>
        /// <remarks>
        /// The node is classified by name/attributes (a tag listed in <c>targetTag</c>, "a", "img",
        /// a style attribute containing "background", "span", "#comment", or anything else) and a
        /// matching Rawler is created. Unless a branch turned child inspection off, every child
        /// node is then processed recursively and its result attached to (or, when this node
        /// produced nothing, used as) the returned Rawler.
        /// </remarks>
        /// <param name="node">HTML node to convert.</param>
        /// <returns>
        /// The Rawler built for <paramref name="node"/>, or <c>null</c> when neither the node
        /// nor any inspected child produced one.
        /// </returns>
        public RawlerBase CreateRawler(HtmlNode node)
        {
            var baseUrl = BaseUrl;
            RawlerBase rawler = null;

            // When false, the child nodes of this node are not visited.
            bool inspectChildNodes = true;
            if (targetTag.Contains(node.Name))
            {
                Tags tags = new Tags() { Tag = node.Name };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.Attributes.Where(n => n.Name == "id").Any())
                {
                    tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    // Only a text node inside: write it directly and stop descending.
                    tags.AddChildren(new DataWrite() { Attribute = tags.ClassName });
                    inspectChildNodes = false;
                }
                if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
                {
                    tags.TagVisbleType = TagVisbleType.Outer;
                    // NOTE(review): rawler is overwritten with tags right below, so only the
                    // effect of this call chain on tags matters here.
                    rawler = tags.Add(new ImageLinks() { ImageType = ImageType.BackgroundImage }).DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }

                rawler = tags;
            }
            else if (node.Name == "a")
            {
                var resultUrlTFIDF = urlTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetLink().Select(n => n.Url));
                var url = node.OuterHtml.ToHtml(baseUrl).GetLink().FirstDefault<RawlerLib.Web.Link, string>(n => n.Url, null);
                if (url != null)
                {
                    // An IDF of 0 or less means the URL is present in every document.
                    if (urlTfidf.IDFDic.GetValueOrDefault(url) !=null && urlTfidf.IDFDic.GetValueOrDefault(url).Value <= 0)
                    {
                        rawler = null;
                        inspectChildNodes = false;
                    }
                    else
                    {
                        if (resultUrlTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                        {
                            rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                                new Links() { VisbleType = LinkVisbleType.Url }.DataWrite(node.GetClassName() + "_MainLink", DataAttributeType.Url).GetRoot());
                            if (node.ChildNodes.Count == 1 && node.ChildNodes.First().Name == "#text")
                            {
                                rawler.Add(new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(node.GetClassName() + "_MainLabel").GetRoot());
                            }
                        }
                        else
                        {
                            rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                                new Links() { VisbleType = LinkVisbleType.Url, Enable = enableGetSubUrlLink }.DataWrite(node.GetClassName() + "_SubLink").GetRoot(),
                                new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(node.GetClassName() + "_SubLabel").GetRoot()
                            );

                        }
                    }
                }
                else
                {
                    // An <a> tag without a URL.
                    Tags tags = new Tags() { Tag = node.Name };
                    if (node.Attributes.Where(n => n.Name == "class").Any())
                    {
                        tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                    }
                    if (node.Attributes.Where(n => n.Name == "id").Any())
                    {
                        tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
                    }
                    rawler = tags;
                }
                if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    inspectChildNodes = false;
                }

            }
            else if (node.Name == "img")
            {
                var resultImgeTFIDF = imageTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetImageLink().Select(n => n.Url));

                var url = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
                if (url != null)
                {
                    // Guard against a URL that has no IDF entry, mirroring the check in the "a"
                    // branch; without it .Value is read from a missing entry.
                    var imageIdf = imageTfidf.IDFDic.GetValueOrDefault(url);
                    if (imageTfidf.IDFDic.Count > 0 && imageIdf != null && imageIdf.Value <= 0)
                    {
                        rawler = null;
                        inspectChildNodes = false;
                    }
                    else
                    {
                        if (resultImgeTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                        {
                            rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                        }
                        else
                        {
                            if (iconImageColumn)
                            {
                                rawler = new DataWrite() { AttributeTree = new ImageLinks() { VisbleType = LinkVisbleType.Label }, Value = "1" };
                            }
                            else
                            {
                                rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Icon", DataAttributeType.Image).GetRoot();
                            }
                        }
                    }
                }
            }
            // React to background images declared through a style attribute.
            else if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
            {
                rawler = new ImageLinks() { ImageType = ImageType.BackgroundImage }.DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
            }
            else if (node.Name == "span")
            {
                Tags tags = new Tags() { Tag = node.Name };
                if (node.Attributes.Where(n => n.Name == "class").Any())
                {
                    tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
                }
                if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                {
                    tags.AddChildren(new DataWrite() { Attribute = tags.ClassName });
                    inspectChildNodes = false;
                }

                rawler = tags;
            }
            else if (node.Name == "#comment")
            {
                // Comments produce nothing and are not descended into.
                inspectChildNodes = false;
            }
            else
            {
                var t = node.OuterHtml.Replace("\n", "").Trim();
                if (t.Length > 0)
                {
                    rawler = new TagClear().Trim().Add(new DataWrite() { Attribute = node.GetClassName() + "_" + node.Name }).GetRoot();
                    if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
                    {
                        inspectChildNodes = false;
                    }
                }
            }
            if (rawler != null && node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "span").Any())
            {
                rawler.AddChildren(new DataWrite() { Attribute = node.GetClassName() });
            }

            foreach (var item in node.ChildNodes)
            {
                if (inspectChildNodes)
                {
                    var r = CreateRawler(item);

                    if (r != null && rawler != null)
                    {
                        rawler.AddChildren(r);
                    }
                    else
                    {
                        // This node produced nothing: the first non-null child result becomes the result.
                        if (r != null && rawler == null)
                        {
                            rawler = r;
                        }
                    }
                }

            }
            return rawler;
        }
Exemple #7
0
        /// <summary>
        /// Walks the properties of a JSON object and runs one <c>DataWrite</c> per property.
        /// </summary>
        /// <remarks>
        /// <c>Json</c> is used as the input when it is non-empty; otherwise the result of
        /// <c>GetText()</c> is parsed. For a property whose value is a nested object, a nested
        /// <c>DataWriteAllJsonData</c> is run with the prefix extended by the property name and ".".
        /// </remarks>
        /// <param name="runChildren">Passed through to <c>base.Run</c>.</param>
        public override void Run(bool runChildren)
        {
            string input = GetText();
            if (!string.IsNullOrEmpty(Json))
            {
                input = Json;
            }

            var parsed = JObject.Parse(input);
            foreach (var entry in parsed.Properties())
            {
                var entryValue = entry.Value;
                if (entryValue == null || entryValue.Type == Newtonsoft.Json.Linq.JTokenType.Null)
                {
                    // Absent values and JSON null are both stored as the text "Null".
                    SetText("Null");
                }
                else if (entryValue.Type == Newtonsoft.Json.Linq.JTokenType.Object)
                {
                    // Nested object: hand it to a nested instance; the text of this node is left as is.
                    var inner = new DataWriteAllJsonData();
                    inner.SetParent(this);
                    inner.Json = entryValue.ToString();
                    inner.PropertyName = this.PropertyName.NullIsEmpty() + entry.Name + ".";
                    inner.Run();
                }
                else
                {
                    SetText(entryValue.ToString());
                }

                // Emit the property under prefix + property name.
                var output = new DataWrite();
                output.SetParent(this);
                output.Attribute = PropertyName.NullIsEmpty() + entry.Name;
                output.Run();
            }

            base.Run(runChildren);
        }