/// <summary>
/// Creates a clone of this node, including clones of all of its children.
/// </summary>
/// <param name="parent">The node that becomes the parent of the clone.</param>
/// <returns>The newly created clone.</returns>
public override RawlerBase Clone(RawlerBase parent)
{
    var copy = new DataWrite();
    RawlerLib.ObjectLib.FildCopy(this, copy);
    copy.SetParent(parent);
    this.CloneEvent(copy);

    // Clear the child list on the copy, then rebuild it below from
    // freshly cloned children that are parented to the copy.
    copy.children.Clear();
    foreach (var original in this.Children)
    {
        copy.AddChildren(original.Clone(copy));
    }

    return copy;
}
/// <summary>
/// Work performed by this class: parses a JSON object and runs one
/// <c>DataWrite</c> per top-level property.
/// </summary>
/// <param name="runChildren">Passed through unchanged to the base implementation.</param>
public override void Run(bool runChildren)
{
    // The Json property, when non-empty, takes precedence over GetText().
    string source = GetText();
    if (!string.IsNullOrEmpty(Json))
    {
        source = Json;
    }

    var parsed = JObject.Parse(source);
    foreach (var property in parsed.Properties())
    {
        var value = property.Value;

        if (value == null || value.Type == Newtonsoft.Json.Linq.JTokenType.Null)
        {
            // Missing and explicit-null values are both written as the literal "Null".
            SetText("Null");
        }
        else if (value.Type == Newtonsoft.Json.Linq.JTokenType.Object)
        {
            // Nested objects are handled by a child instance whose property
            // prefix is extended with "<name>.".
            // NOTE(review): SetText is not called on this path, so the DataWrite
            // below runs with whatever text was set previously - confirm intended.
            DataWriteAllJsonData nested = new DataWriteAllJsonData();
            nested.SetParent(this);
            nested.Json = value.ToString();
            nested.PropertyName = this.PropertyName.NullIsEmpty() + property.Name + ".";
            nested.Run();
        }
        else
        {
            SetText(value.ToString());
        }

        DataWrite writer = new DataWrite();
        writer.SetParent(this);
        writer.Attribute = PropertyName.NullIsEmpty() + property.Name;
        writer.Run();
    }

    base.Run(runChildren);
}
/// <summary>
/// Produces a copy of this node together with copies of its children.
/// </summary>
/// <param name="parent">Parent node to assign to the copy.</param>
/// <returns>The copied node.</returns>
public override RawlerBase Clone(RawlerBase parent)
{
    var duplicate = new DataWrite();

    // Copy the fields first, then re-parent and notify via CloneEvent.
    RawlerLib.ObjectLib.FildCopy(this, duplicate);
    duplicate.SetParent(parent);
    this.CloneEvent(duplicate);

    // Empty the duplicate's child list before adding cloned children to it.
    duplicate.children.Clear();
    foreach (var source in this.Children)
    {
        var clonedChild = source.Clone(duplicate);
        duplicate.AddChildren(clonedChild);
    }

    return duplicate;
}
/// <summary>
/// Builds the hard-coded rawler tree for the IMDb name search
/// (star_sign=aquarius, sorted by starmeter) and assigns it to <c>root.Rawler</c>.
/// </summary>
/// <remarks>
/// The tree is assembled in code: a list page, the linked detail page with
/// its Name / role / born-* data writes, and a "Next" link for paging.
/// </remarks>
private void CreateRawler()
{
    Tool.Data data = new Tool.Data();
    root.Rawler = data;

    // List page.
    // Alternative start URL that was used to resume further into the list:
    //   http://www.imdb.com/search/name?sort=starmeter,asc&star_sign=aquarius&start=10251
    Tool.Page listPage = new Tool.Page();
    data.AddChildren(listPage);
    listPage.Url = "http://www.imdb.com/search/name?star_sign=aquarius&sort=starmeter,asc";
    listPage.Comment = "一覧ページ読み込み";

    // <table class="results"> -> each <tr> matching "detailed".
    Tool.TagExtraction resultsTable = new Tool.TagExtraction();
    resultsTable.Tag = "table";
    resultsTable.ParameterFilter = "class=\"results\"";
    listPage.AddChildren(resultsTable);

    Tool.TagExtraction resultRow = new Tool.TagExtraction();
    resultRow.Tag = "tr";
    resultRow.IsMulti = true;
    resultRow.ParameterFilter = "detailed";
    resultsTable.AddChildren(resultRow);

    // Link filtered on "/name/" inside each row.
    Tool.Link personLink = new Tool.Link();
    personLink.UseAbsolutetLink = true;
    personLink.UrlFilter = "/name/";
    personLink.VisbleType = Tool.LinkVisbleType.Url;
    resultRow.AddChildren(personLink);

    // The page the link leads to.
    Tool.Page personPage = new Tool.Page();
    personLink.AddChildren(personPage);

    Tool.TagExtraction overviewTable = new Tool.TagExtraction();
    personPage.AddChildren(overviewTable);
    overviewTable.Tag = "table";
    overviewTable.ParameterFilter = "id=\"name-overview-widget-layout\"";

    // Name: text clipped between the header <h1> tags.
    Tool.ClipText nameClip = new Tool.ClipText();
    overviewTable.AddChildren(nameClip);
    nameClip.StartClip = "<h1 class=\"header\">";
    nameClip.EndClip = "</h1>";
    nameClip.AddChildren(new Tool.DataWrite("Name"));
    nameClip.AddChildren(new Tool.Report("Person:", ""));

    // Roles: every link label inside the info bar.
    Tool.ClipText infoBarClip = new Tool.ClipText("<div class=\"infobar\">", "</div>");
    overviewTable.AddChildren(infoBarClip);
    Tool.Link roleLink = new Tool.Link();
    infoBarClip.AddChildren(roleLink);
    roleLink.VisbleType = Tool.LinkVisbleType.Label;
    roleLink.IsMulti = true;
    roleLink.AddChildren(new Tool.DataWrite("role"));

    // Birth data: the txt-block that contains the "Born:" heading.
    Tool.ClipText textBlockClip = new Tool.ClipText("<div class=\"txt-block\">", "</div>");
    textBlockClip.IsMulti = true;
    overviewTable.AddChildren(textBlockClip);
    Tool.Contains bornBlock = new Tool.Contains("<h4 class=\"inline\">Born:</h4>");
    textBlockClip.AddChildren(bornBlock);

    Tool.Link bornYearLink = new Tool.Link("/search/name?birth_year", "", false);
    bornYearLink.VisbleType = Tool.LinkVisbleType.Label;
    bornYearLink.AddChildren(new Tool.DataWrite("born-year"));

    Tool.Link bornDateLink = new Tool.Link("/date/", "", false);
    bornDateLink.VisbleType = Tool.LinkVisbleType.Label;
    bornDateLink.AddChildren(new Tool.DataWrite("born-date"));

    Tool.Link bornPlaceLink = new Tool.Link("/search/name?birth_place=", "", false);
    bornPlaceLink.VisbleType = Tool.LinkVisbleType.Label;
    bornPlaceLink.AddChildren(new Tool.DataWrite("born-place"));

    bornBlock.AddChildren(bornYearLink);
    bornBlock.AddChildren(bornDateLink);
    bornBlock.AddChildren(bornPlaceLink);

    // NextDataRow is added as the last child of the detail page.
    personPage.AddChildren(new Tool.NextDataRow());

    // Paging: the "Next" link on the list page gets a NextPage child.
    Tool.Link nextLink = new Tool.Link();
    nextLink.UrlFilter = "search/name?";
    nextLink.LabelFilter = "Next";
    nextLink.UseAbsolutetLink = true;
    listPage.AddChildren(nextLink);

    Tool.NextPage nextPage = new Tool.NextPage();
    nextLink.AddChildren(nextPage);
}
/// <summary>
/// Recursively builds a rawler subtree for the given HTML node.
/// </summary>
/// <param name="node">The HTML node to inspect; its child nodes are visited recursively.</param>
/// <returns>
/// The rawler created for this node (with rawlers for its children attached),
/// the first non-null child rawler when the node itself produced none,
/// or <c>null</c> when nothing was produced.
/// </returns>
public RawlerBase CreateRawler(HtmlNode node)
{
    var baseUrl = BaseUrl;
    RawlerBase rawler = null;

    // "Examine the next (child) nodes": when false, recursion into children is skipped.
    bool flag次のノードを調べる = true;

    if (targetTag.Contains(node.Name))
    {
        Tags tags = new Tags() { Tag = node.Name };
        if (node.Attributes.Where(n => n.Name == "class").Any())
        {
            tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
        }
        if (node.Attributes.Where(n => n.Name == "id").Any())
        {
            tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
        }
        if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
        {
            // A single text child: write it under the class name and stop descending.
            tags.AddChildren(new DataWrite() { Attribute = tags.ClassName });
            flag次のノードを調べる = false;
        }
        if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
        {
            tags.TagVisbleType = TagVisbleType.Outer;
            // The value assigned here is overwritten by "rawler = tags" just below.
            rawler = tags.Add(new ImageLinks() { ImageType = ImageType.BackgroundImage }).DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
        }
        rawler = tags;
    }
    else if (node.Name == "a")
    {
        var resultUrlTFIDF = urlTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetLink().Select(n => n.Url));
        var url = node.OuterHtml.ToHtml(baseUrl).GetLink().FirstDefault<RawlerLib.Web.Link, string>(n => n.Url, null);
        if (url != null)
        {
            // An IDF of 0 or less means the URL is present in every document.
            var urlIdf = urlTfidf.IDFDic.GetValueOrDefault(url);
            if (urlIdf != null && urlIdf.Value <= 0)
            {
                rawler = null;
                flag次のノードを調べる = false;
            }
            else
            {
                if (resultUrlTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                {
                    rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                        new Links() { VisbleType = LinkVisbleType.Url }.DataWrite(node.GetClassName() + "_MainLink", DataAttributeType.Url).GetRoot());
                    if (node.ChildNodes.Count == 1 && node.ChildNodes.First().Name == "#text")
                    {
                        rawler.Add(new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(node.GetClassName() + "_MainLabel").GetRoot());
                    }
                }
                else
                {
                    rawler = new Links() { VisbleType = LinkVisbleType.Tag }.AddRange(
                        new Links() { VisbleType = LinkVisbleType.Url, Enable = enableGetSubUrlLink }.DataWrite(node.GetClassName() + "_SubLink").GetRoot(),
                        new Links() { VisbleType = LinkVisbleType.Label }.DataWrite(node.GetClassName() + "_SubLabel").GetRoot()
                        );
                }
            }
        }
        else
        {
            // Case of an A tag that has no URL.
            Tags tags = new Tags() { Tag = node.Name };
            if (node.Attributes.Where(n => n.Name == "class").Any())
            {
                tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
            }
            if (node.Attributes.Where(n => n.Name == "id").Any())
            {
                tags.IdName = node.Attributes.Where(n => n.Name == "id").First().Value;
            }
            rawler = tags;
        }
        if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
        {
            flag次のノードを調べる = false;
        }
    }
    else if (node.Name == "img")
    {
        var resultImgeTFIDF = imageTfidf.GetResult(node.OuterHtml.ToHtml(baseUrl).GetImageLink().Select(n => n.Url));
        var url = node.OuterHtml.ToHtml(baseUrl).GetImageLink().FirstDefault(n => n.Url, null);
        if (url != null)
        {
            // Guard the lookup result before reading .Value, matching the "a"
            // branch: a URL missing from a non-empty IDF dictionary must not
            // fail here, it simply falls through to the normal handling.
            var imageIdf = imageTfidf.IDFDic.GetValueOrDefault(url);
            if (imageTfidf.IDFDic.Count > 0 && imageIdf != null && imageIdf.Value <= 0)
            {
                rawler = null;
                flag次のノードを調べる = false;
            }
            else
            {
                if (resultImgeTFIDF.GetTakeTopValue(n => n.TFIDF).Where(n => n.Word == url).Any())
                {
                    rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
                }
                else
                {
                    if (iconImageColumn)
                    {
                        rawler = new DataWrite() { AttributeTree = new ImageLinks() { VisbleType = LinkVisbleType.Label }, Value = "1" };
                    }
                    else
                    {
                        rawler = new ImageLinks().DataWrite(node.GetClassName() + "_Icon", DataAttributeType.Image).GetRoot();
                    }
                }
            }
        }
    }
    // React to background images.
    else if (node.Attributes.Where(n => n.Name == "style" && n.Value.Contains("background")).Any())
    {
        rawler = new ImageLinks() { ImageType = ImageType.BackgroundImage }.DataWrite(node.GetClassName() + "_Image", DataAttributeType.Image).GetRoot();
    }
    else if (node.Name == "span")
    {
        Tags tags = new Tags() { Tag = node.Name };
        if (node.Attributes.Where(n => n.Name == "class").Any())
        {
            tags.ClassName = node.Attributes.Where(n => n.Name == "class").First().Value;
        }
        if (node.ChildNodes.Count() == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
        {
            tags.AddChildren(new DataWrite() { Attribute = tags.ClassName });
            flag次のノードを調べる = false;
        }
        rawler = tags;
    }
    else if (node.Name == "#comment")
    {
        // Comment nodes produce no rawler and are not descended into.
        flag次のノードを調べる = false;
    }
    else
    {
        var t = node.OuterHtml.Replace("\n", "").Trim();
        if (t.Length > 0)
        {
            rawler = new TagClear().Trim().Add(new DataWrite() { Attribute = node.GetClassName() + "_" + node.Name }).GetRoot();
            if (node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "#text").Any())
            {
                flag次のノードを調べる = false;
            }
        }
    }

    // A node whose only child is a span additionally gets a DataWrite
    // named after the node's class.
    if (rawler != null && node.ChildNodes.Count == 1 && node.ChildNodes.Where(n => n.Name == "span").Any())
    {
        rawler.AddChildren(new DataWrite() { Attribute = node.GetClassName() });
    }

    foreach (var item in node.ChildNodes)
    {
        if (flag次のノードを調べる)
        {
            var r = CreateRawler(item);
            if (r != null && rawler != null)
            {
                rawler.AddChildren(r);
            }
            else
            {
                // No rawler for this node yet: adopt the first non-null child result.
                if (r != null && rawler == null)
                {
                    rawler = r;
                }
            }
        }
    }
    return rawler;
}
/// <summary>
/// Work performed by this class: reads a JSON object and, for each of its
/// top-level properties, sets the text and runs a <c>DataWrite</c>.
/// </summary>
/// <param name="runChildren">Forwarded as-is to the base implementation.</param>
public override void Run(bool runChildren)
{
    // Start from GetText(); a non-empty Json property overrides it.
    string jsonText = GetText();
    if (string.IsNullOrEmpty(Json) == false)
    {
        jsonText = Json;
    }

    var root = JObject.Parse(jsonText);
    foreach (var entry in root.Properties())
    {
        var token = entry.Value;

        // A missing value is treated exactly like an explicit JSON null.
        var tokenType = token == null
            ? Newtonsoft.Json.Linq.JTokenType.Null
            : token.Type;

        switch (tokenType)
        {
            case Newtonsoft.Json.Linq.JTokenType.Null:
                SetText("Null");
                break;

            case Newtonsoft.Json.Linq.JTokenType.Object:
                // Nested object: run a child instance whose property prefix
                // is extended with the property name and a dot.
                DataWriteAllJsonData inner = new DataWriteAllJsonData();
                inner.SetParent(this);
                inner.Json = token.ToString();
                inner.PropertyName = this.PropertyName.NullIsEmpty() + entry.Name + ".";
                inner.Run();
                break;

            default:
                SetText(token.ToString());
                break;
        }

        DataWrite output = new DataWrite();
        output.SetParent(this);
        output.Attribute = PropertyName.NullIsEmpty() + entry.Name;
        output.Run();
    }

    base.Run(runChildren);
}