예제 #1
0
        /// <summary>
        /// Advances <c>originalid</c> to the next id that has a row in
        /// <c>htmlpage.original</c> and stores that row's <c>info</c> in the
        /// class field <c>sampleString</c> (the sample used for the concept
        /// computation). Requests application exit once <c>originalid</c>
        /// exceeds <c>originalendid</c>.
        /// </summary>
        private void nextSample()
        {
            // Iterate instead of recursing so that a long run of missing ids
            // cannot grow the call stack.
            while (true)
            {
                // Exactly one increment per probed id. The previous version
                // incremented twice on a miss and therefore skipped the id
                // that followed every gap.
                originalid++;

                // Check the upper bound before touching the database.
                if (originalid > originalendid)
                {
                    // Application.Exit() only requests shutdown (assuming this is
                    // System.Windows.Forms.Application); it does not stop this
                    // method, so return explicitly.
                    Application.Exit();
                    return;
                }

                original toriginal = htmlpage.original.Find(originalid);
                if (toriginal != null)  // a row exists for this id
                {
                    sampleString = toriginal.info;
                    Console.Out.WriteLine("对照的样本  :" + sampleString);
                    return;
                }
                // No row for this id: fall through and probe the next one.
            }
        }
예제 #2
0
        /// <summary>
        /// Timer callback that polls <c>webBrowser1</c> until the page is usable,
        /// then stores the text of every element with tag name <c>tag</c> whose
        /// attributes match all entries of <c>attrs</c> as a new <c>original</c>
        /// row, and finally moves on to the next URL via <c>nextextraturl</c>.
        /// After 35 ticks without a usable page the current URL is skipped.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void extract_timer_Tick(object sender, EventArgs e)
        {
            // Number of ticks after which the current page is given up on.
            const int maxTicks = 35;

            count2++;
            if (count2 >= maxTicks)
            {
                // Still not loaded: skip this page and extract the next one.
                extract_timer.Stop();
                Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
                extractid++;
                nextextraturl(extractid);
                return;
            }

            WebBrowserReadyState state = this.webBrowser1.ReadyState;
            if (state != WebBrowserReadyState.Complete && state != WebBrowserReadyState.Interactive)
            {
                return;  // not ready yet; try again on the next tick
            }

            // The document or its body may not exist yet (notably while the
            // browser is only Interactive). Leave the timer running and retry on
            // the next tick rather than dereferencing null; the tick limit above
            // still guarantees that we eventually move on.
            HtmlElement body = this.webBrowser1.Document != null ? this.webBrowser1.Document.Body : null;
            if (body == null)
            {
                return;
            }

            extract_timer.Stop();

            // Page source, read at most once and only if some element matches.
            string pageSource = null;

            foreach (HtmlElement el in body.GetElementsByTagName(tag))
            {
                bool isRight = true;
                foreach (var item in attrs)
                {
                    String attr = item.Name;
                    // The class attribute is looked up under the name "className".
                    if (item.Name.Equals("classname") || item.Name.Equals("class"))
                    {
                        attr = "className";
                    }

                    if (!item.Value.Equals(el.GetAttribute(attr)))
                    {
                        // One mismatch is enough; skip the remaining attributes.
                        isRight = false;
                        break;
                    }
                }
                if (!isRight)
                {
                    continue;
                }

                // All attributes matched: extract the element if it has text.
                string text = el.InnerText;
                if (text == null)
                {
                    continue;
                }
                text = text.Trim();
                if (text.Length == 0)
                {
                    continue;
                }

                if (pageSource == null)
                {
                    pageSource = this.webBrowser1.DocumentText;
                }

                // mark, site and extracturl are fields set outside this method.
                original orig = new original();
                orig.info  = Regex.Replace(text, "\\s+", " ");  // collapse whitespace runs
                orig.mark  = mark;
                orig.site  = site;
                orig.url   = extracturl;
                orig.urlid = extractid;
                orig.doc   = pageSource;
                htmlpage.original.Add(orig);
                htmlpage.SaveChanges();
            }

            extractid++;
            nextextraturl(extractid);
        }