예제 #1
0
 /// <summary>
 /// Click handler that loads one hard-coded product page into <c>webBrowser1</c>
 /// and starts <c>Docfinished</c> (judging by <c>Docfinished_Tick</c>, the timer that
 /// polls the browser until the page is ready for extraction).
 /// </summary>
 /// <param name="sender">Source of the click event (not used).</param>
 /// <param name="e">Event data (not used).</param>
 private void extractv_Click(object sender, EventArgs e)
 {
     // Earlier experiments, kept for reference:
     // this.webBrowser1.DocumentText = htmlpage.detailinfo.Find(1).html;
     //  Regex.Match("a.b?c。d?", "[?.。?]");
     // this.webBrowser1.Navigate("http://ieeexplore.ieee.org/search/searchresult.jsp?newsearch=true&queryText=information%20extraction&fname=&lname=&title=&volume=&issue=&spage=");

     // Start the tick counter from zero, exactly as nextpage() does before it navigates.
     // Without this, a second click inherits the count left over from the previous page
     // and Docfinished_Tick may treat the new page as timed out straight away.
     count = 0;

     this.webBrowser1.Navigate("http://www.tesco.com/direct/samsung-ue40ku6020-smart-4k-ultra-hd-40-inch-led-tv-with-freeview-hd/356-2551.prd?skuId=356-2551");
     Docfinished.Start();

     //  nextpage();
     // Console.Out.WriteLine(id);
 }
예제 #2
0
        /// <summary>
        /// Advances <c>id</c> to the next entry that exists in <c>htmlpage.urltable</c>,
        /// navigates <c>webBrowser1</c> to its URL and starts <c>Docfinished</c>.
        /// Ids with no matching row are skipped. Once <c>id</c> reaches the upper bound
        /// the application is asked to exit and nothing else is done.
        /// </summary>
        private void nextpage()
        {
            // Upper bound of the ids to crawl (value taken from the original code).
            const int maxUrlId = 2516;

            // Loop instead of recursing so that long runs of missing ids cannot grow the stack.
            while (true)
            {
                id++;

                if (id >= maxUrlId)
                {
                    // Application.Exit() only requests shutdown and then returns to the caller,
                    // so we must return explicitly; otherwise the method would keep navigating
                    // (or keep searching for ids forever when no further rows exist).
                    Application.Exit();
                    return;
                }

                urltable ut = htmlpage.urltable.Find(id);
                if (ut == null)
                {
                    // No row for this id: try the next one.
                    continue;
                }

                // The tick counter must be reset before the next page is parsed.
                count = 0;
                this.webBrowser1.Navigate(ut.url);
                Console.Out.WriteLine("解析的" + id + "对应url:" + ut.url);
                Docfinished.Start();
                return;
            }
        }
예제 #3
0
        /// <summary>
        /// Tick handler (by its name, for the <c>Docfinished</c> timer). Each tick increments
        /// <c>count</c> and checks whether the page in <c>webBrowser1</c> is ready. When it is,
        /// the timer is stopped, title/meta are read from the document head, <c>Vextra</c> is run
        /// on the body and the entries of <c>adictionary</c> are classified and printed.
        /// If the page is still not ready on the 35th tick, the handler gives up and calls
        /// <c>nextpage()</c>.
        /// </summary>
        /// <param name="sender">Source of the tick event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void Docfinished_Tick(object sender, EventArgs e)
        {
            count++;

            // The page is usable when it is fully loaded, or - on ticks 31 to 34 only -
            // when it has at least reached the Interactive state.
            WebBrowserReadyState state = this.webBrowser1.ReadyState;
            bool fullyLoaded = state == WebBrowserReadyState.Complete;
            bool interactiveAfterWait = count > 30 && count < 35 && state == WebBrowserReadyState.Interactive;

            if (!fullyLoaded && !interactiveAfterWait)
            {
                if (count >= 35) // still not loaded after 35 ticks: skip to the next URL
                {
                    Docfinished.Stop();
                    Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
                    nextpage();
                }
                return;
            }

            // ---- The document is ready: process it ----
            Docfinished.Stop();

            // Node data stored for the previous URL must be cleared before handling a new one.
            adictionary.Clear();

            // Guard against a missing document/body so that a page which cannot be parsed
            // skips to the next URL (like the timeout path) instead of throwing inside the tick.
            HtmlDocument document = this.webBrowser1.Document;
            HtmlElement body = document == null ? null : document.Body;
            if (body == null)
            {
                Console.Out.WriteLine("页面文档或body为空,无法解析,跳到下一页");
                nextpage();
                return;
            }

            int i = 0;
            // Title/meta extraction may need to be adapted per web site.
            string title      = null;
            string meta       = null;
            int    vimglink   = 0; // entries classified as "imglink" (images)
            int    vnlink     = 0; // entries classified as "alink" (links)
            int    nstext     = 0; // entries classified as "stext" (short text)
            int    nltext     = 0; // all remaining entries (treated as long text)
            int    sc         = 0; // sum of type[1] over short-text entries (presumably character counts)
            int    c          = 0; // sum of type[1] over long-text entries (presumably character counts)
            // The following counters are only used by the disabled feature/persistence code below.
            int    nparagraph = 0; // number of paragraphs
            int    len        = 0; // text length
            int    nlink      = 0; // number of links
            int    nimg       = 0; // number of images
            int    nt         = 0; // regular structures: tables and lists

            /*  // Disabled test: dump the document to a file
             * StreamReader sr = new StreamReader(webBrowser1.DocumentStream);
             * StreamWriter sw = new StreamWriter(@"D:\test\test.html");
             * sw.Write(sr.ReadToEnd());
             * sw.Flush();
             * sw.Close();
             * sr.Close();
             * return;*/

            // Extract title and meta from <head> (originally written for JD.com pages).
            foreach (HtmlElement head in document.GetElementsByTagName("head"))
            {
                // Title: text of the first <title> element.
                foreach (HtmlElement titleElement in head.GetElementsByTagName("title"))
                {
                    title = titleElement.InnerText;
                    break;
                }

                // Meta: content of the first <meta> whose name is "keywords" or "description".
                // The comparison ignores case because pages commonly write "Keywords"/"Description".
                foreach (HtmlElement metaElement in head.GetElementsByTagName("meta"))
                {
                    string metaName = metaElement.GetAttribute("name");
                    if (string.Equals(metaName, "keywords", StringComparison.OrdinalIgnoreCase)
                        || string.Equals(metaName, "description", StringComparison.OrdinalIgnoreCase))
                    {
                        meta = metaElement.GetAttribute("content");
                        break;
                    }
                }
            }

            //Console.Out.WriteLine(this.webBrowser1.Document.Body.InnerText);
            //Console.Out.WriteLine("打印出孩子节点的数量 :"+this.webBrowser1.Document.Body.OuterHtml);
            //Console.Out.WriteLine("需要分割??? :"+ needsplit(this.webBrowser1.Document.Body));

            // Walk the body (Vextra is recursive); presumably it records the nodes that need
            // no further splitting in adictionary - confirm against Vextra.
            Vextra(body);

            // Classify every recorded node, print it and update the per-type tallies.
            foreach (HtmlElement ele in adictionary.Keys)
            {
                i++;
                String[] type = judgetype(ele);
                Console.Out.WriteLine("第" + i + "个元素   " + type[0] + "   " + adictionary[ele]);

                switch (type[0])
                {
                    case "imglink":
                        vimglink++;
                        break;
                    case "alink":
                        vnlink++;
                        break;
                    case "stext":
                        nstext++;
                        sc = sc + int.Parse(type[1]);
                        break;
                    default:
                        nltext++;
                        c = c + int.Parse(type[1]);
                        break;
                }
            }

            /* Disabled: print the visual-block statistics
             * Console.Out.WriteLine("图片:" + vimglink);
             * Console.Out.WriteLine("链接:" + vnlink);
             * Console.Out.WriteLine("短文本:" + nstext);
             * Console.Out.WriteLine("长文本:" + nltext);
             * if (nstext > 0)
             * {
             *  Console.Out.WriteLine("短文本长度" + sc);
             *  Console.Out.WriteLine("短文本平均长度:" + (float)sc / (float)nstext);
             * }
             * else
             * {
             *  Console.Out.WriteLine("短文本平均长度:" + 0);
             * }
             * if (nltext > 0)
             * {
             *  Console.Out.WriteLine("长文本长度" + c);
             *  Console.Out.WriteLine("长文本平均长度:" + (float)c / (float)nltext);
             * }
             * else {
             *  Console.Out.WriteLine("长文本平均长度:" + 0);
             * } */

            /* Disabled: HTML element features (everything above handles the visual information)
             * nparagraph= this.webBrowser1.Document.Body.GetElementsByTagName("p").Count;
             * len=Regex.Replace(this.webBrowser1.Document.Body.InnerText, "\\s*", "").Length;
             * nlink = this.webBrowser1.Document.Body.GetElementsByTagName("a").Count;
             * nimg = this.webBrowser1.Document.Body.GetElementsByTagName("img").Count;
             * nt= this.webBrowser1.Document.Body.GetElementsByTagName("table").Count+ this.webBrowser1.Document.Body.GetElementsByTagName("ul").Count+this.webBrowser1.Document.Body.GetElementsByTagName("ol").Count;
             */

            /*
             * Console.Out.WriteLine("nspecical:" + nparagraph);
             * Console.Out.WriteLine("nspecical:" + len);
             * Console.Out.WriteLine("nspecical:" + nlink);
             * Console.Out.WriteLine("nspecical:" + nimg);
             * Console.Out.WriteLine("nspecical:" + nt); */

            /* Disabled: store the result in the database once all processing is done
             * if ((vimglink+ vnlink+ nstext+ nltext)<3 || title==null) // segmentation failed or the page did not load correctly: discard
             * {
             *     // go straight to the next page
             *     nextpage();
             *     MessageBox.Show("网络问题,未能正确解析!!!!!!!!!!!!!");
             *     Application.Exit();
             *     return;
             * }
             * urltable ut = htmlpage.urltable.Find(id);
             * detailinfo detail = new detailinfo();
             * // URL of the document
             * detail.url = ut.url;
             * detail.site = ut.site;
             * // category of the document, decided by the extracted content
             * detail.mark = ut.mark;
             * detail.title = title;
             * detail.meta = meta;
             * detail.vimglink = vimglink;
             * detail.vnlink = vnlink;
             * detail.nstext = nstext;
             * detail.nltext = nltext;
             * detail.nlink = nlink;
             * detail.nparagraph = nparagraph;
             * detail.len = len;
             * detail.nimg = nimg;
             * detail.nt = nt;
             * detail.html = this.webBrowser1.DocumentText;  */
            //  Console.Out.WriteLine("url" + ut.url);
            //  Console.Out.WriteLine("site:" + ut.site);
            // Console.Out.WriteLine("mark:" + ut.mark);
            // Console.Out.WriteLine("title:" + title);
            //  Console.Out.WriteLine("meta:" + meta);
            // Console.Out.WriteLine("vimglink:" + vimglink);
            // Console.Out.WriteLine("vnlink:" + vnlink);
            // Console.Out.WriteLine("nstext:" + nstext);
            // Console.Out.WriteLine("nltext:" + nltext);
            // Console.Out.WriteLine("nlink:" + nlink);
            //Console.Out.WriteLine("nparagraph:" + nparagraph);
            // Console.Out.WriteLine("len:" + len);
            // Console.Out.WriteLine("nimg:" + nimg);
            // Console.Out.WriteLine("nt:" + nt);

            /* Disabled: write back to the database
             * try
             * {
             *
             * htmlpage.detailinfo.Add(detail);
             *  htmlpage.SaveChanges();
             * }
             * catch(DbEntityValidationException dbEx) {
             *  MessageBox.Show(dbEx.Message);
             * }*/

            // Advancing to the next URL is currently disabled, so processing stops after this page.
            //        nextpage();
            // htmlpage.detailinfo.Find(1).html;
        }