/// <summary>
/// Click handler that loads a hard-coded sample product page in the embedded browser
/// and starts the <c>Docfinished</c> timer, whose tick handler polls for load completion.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void extractv_Click(object sender, EventArgs e)
{
    // Earlier experiments, kept for reference:
    // this.webBrowser1.DocumentText = htmlpage.detailinfo.Find(1).html;
    // Regex.Match("a.b?c。d?", "[?.。?]");
    // this.webBrowser1.Navigate("http://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText=information%20extraction&fname=&lname=&title=&volume=&issue=&spage=");

    // The tick counter is shared with every other page load; reset it so this
    // navigation gets a full polling window, exactly as nextpage() does.
    count = 0;

    this.webBrowser1.Navigate("http://www.tesco.com/direct/samsung-ue40ku6020-smart-4k-ultra-hd-40-inch-led-tv-with-freeview-hd/356-2551.prd?skuId=356-2551");
    Docfinished.Start();

    // nextpage();
    // Console.Out.WriteLine(id);
}
/// <summary>
/// Advances <c>id</c> to the next entry that exists in <c>htmlpage.urltable</c>, navigates the
/// browser to its URL and starts the <c>Docfinished</c> polling timer. Requests application
/// exit once <c>id</c> reaches the hard-coded limit.
/// </summary>
private void nextpage()
{
    // NOTE(review): presumably one past the last row id in urltable — confirm.
    const int idLimit = 2516;

    // Iterate instead of recursing so a long run of missing ids cannot exhaust the stack.
    urltable ut = null;
    while (ut == null)
    {
        id++;
        if (id >= idLimit)
        {
            // Application.Exit() only requests shutdown; it does not stop this method,
            // so return explicitly instead of falling through to another lookup/navigation.
            Application.Exit();
            return;
        }

        ut = htmlpage.urltable.Find(id);
    }

    // The tick counter must be cleared before each new page is parsed.
    count = 0;
    this.webBrowser1.Navigate(ut.url);
    Console.Out.WriteLine("解析的" + id + "对应url:" + ut.url);
    Docfinished.Start();
}
/// <summary>
/// Tick handler of the <c>Docfinished</c> timer. Polls the browser until the page is ready,
/// then reads the head metadata, segments the body via <c>Vextra</c> and tallies the
/// resulting blocks by the type reported by <c>judgetype</c>.
/// </summary>
/// <remarks>
/// Every tick increments <c>count</c>. Extraction runs when the browser reports
/// <see cref="WebBrowserReadyState.Complete"/>, or on ticks 31 to 34 when it reports
/// <see cref="WebBrowserReadyState.Interactive"/>. If tick 35 is reached without extraction,
/// the page is abandoned and <c>nextpage()</c> is called.
/// </remarks>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void Docfinished_Tick(object sender, EventArgs e)
{
    count++;

    // Check whether the key HTML elements have been loaded.
    WebBrowserReadyState state = this.webBrowser1.ReadyState;
    bool fullyLoaded = state == WebBrowserReadyState.Complete;
    bool lateButInteractive = count > 30 && count < 35 && state == WebBrowserReadyState.Interactive;

    if (fullyLoaded || lateButInteractive)
    {
        Docfinished.Stop();

        // Node data collected for the previous URL must be cleared before each new page.
        adictionary.Clear();

        // Guard against pages that expose no document or no body; dereferencing them
        // would throw inside the timer tick.
        HtmlDocument document = this.webBrowser1.Document;
        if (document == null || document.Body == null)
        {
            Console.Out.WriteLine("Document has no body; extraction skipped.");
            return;
        }
        HtmlElement body = document.Body;

        // Title and meta description; other sites may need different extraction rules.
        // Both values are currently consumed only by the disabled persistence step below.
        string title = null;
        string meta = null;
        foreach (HtmlElement head in document.GetElementsByTagName("head"))
        {
            // Only the first <title> of each head is used.
            foreach (HtmlElement t in head.GetElementsByTagName("title"))
            {
                title = t.InnerText;
                break;
            }

            // The first <meta> named "keywords" or "description" supplies the description.
            foreach (HtmlElement m in head.GetElementsByTagName("meta"))
            {
                string metaName = m.GetAttribute("name");
                if (metaName.Equals("keywords") || metaName.Equals("description"))
                {
                    meta = m.GetAttribute("content");
                    break;
                }
            }
        }

        // Vextra is recursive; the loop below relies on it having filled adictionary
        // with the nodes that need no further splitting.
        Vextra(body);

        int vimglink = 0;        // image blocks
        int vnlink = 0;          // link blocks
        int nstext = 0;          // short-text blocks
        int nltext = 0;          // long-text blocks
        int shortTextChars = 0;  // sum of type[1] over short-text blocks
        int longTextChars = 0;   // sum of type[1] over long-text blocks
        int index = 0;

        foreach (HtmlElement ele in adictionary.Keys)
        {
            index++;
            // type[0] is the block kind; type[1] is parsed as an integer for text blocks
            // (presumably the text length — confirm against judgetype).
            String[] type = judgetype(ele);
            Console.Out.WriteLine("第" + index + "个元素 " + type[0] + " " + adictionary[ele]);

            if (type[0].Equals("imglink"))
            {
                vimglink++;
            }
            else if (type[0].Equals("alink"))
            {
                vnlink++;
            }
            else if (type[0].Equals("stext"))
            {
                nstext++;
                shortTextChars = shortTextChars + int.Parse(type[1]);
            }
            else
            {
                // Anything else is counted as long text.
                nltext++;
                longTextChars = longTextChars + int.Parse(type[1]);
            }
        }

        // ---- Disabled pipeline (kept for reference; re-enable as a unit) ----
        // DOM-level statistics:
        //   int nparagraph = body.GetElementsByTagName("p").Count;                 // paragraphs
        //   int len = Regex.Replace(body.InnerText, "\\s*", "").Length;            // text length
        //   int nlink = body.GetElementsByTagName("a").Count;                      // links
        //   int nimg = body.GetElementsByTagName("img").Count;                     // images
        //   int nt = body.GetElementsByTagName("table").Count
        //          + body.GetElementsByTagName("ul").Count
        //          + body.GetElementsByTagName("ol").Count;                        // tables and lists
        //
        // Discard pages whose segmentation failed or that did not load correctly:
        //   if ((vimglink + vnlink + nstext + nltext) < 3 || title == null)
        //   {
        //       nextpage();
        //       MessageBox.Show("网络问题,未能正确解析!!!!!!!!!!!!!");
        //       Application.Exit();
        //       return;
        //   }
        //
        // Store the record in the database:
        //   urltable ut = htmlpage.urltable.Find(id);
        //   detailinfo detail = new detailinfo();
        //   detail.url = ut.url;
        //   detail.site = ut.site;
        //   detail.mark = ut.mark;   // document category
        //   detail.title = title;
        //   detail.meta = meta;
        //   detail.vimglink = vimglink;
        //   detail.vnlink = vnlink;
        //   detail.nstext = nstext;
        //   detail.nltext = nltext;
        //   detail.nlink = nlink;
        //   detail.nparagraph = nparagraph;
        //   detail.len = len;
        //   detail.nimg = nimg;
        //   detail.nt = nt;
        //   detail.html = this.webBrowser1.DocumentText;
        //   try
        //   {
        //       htmlpage.detailinfo.Add(detail);
        //       htmlpage.SaveChanges();
        //   }
        //   catch (DbEntityValidationException dbEx)
        //   {
        //       MessageBox.Show(dbEx.Message);
        //   }
        //
        //   nextpage();

        return;
    }

    // Still not loaded after 35 ticks: give up on this page and move on to the next one.
    if (count >= 35)
    {
        Docfinished.Stop();
        Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
        nextpage();
        return;
    }
}