/// <summary>
/// Advances <c>originalid</c> to the next id that has a row in the <c>original</c> table and
/// stores that row's <c>info</c> text in <c>sampleString</c> as the sample for concept calculation.
/// </summary>
/// <remarks>
/// Ids without a matching row are skipped one at a time. When <c>originalid</c> moves past
/// <c>originalendid</c> the application is asked to exit and no sample is loaded.
/// </remarks>
private void nextSample()
{
    // Iterative on purpose: a long run of missing ids must not grow the call stack.
    while (true)
    {
        originalid++;

        // Past the last id that should be processed: request shutdown.
        if (originalid > originalendid)
        {
            // Application.Exit() only requests shutdown and then returns to the caller,
            // so leave explicitly instead of continuing to look up rows beyond the range.
            Application.Exit();
            return;
        }

        original toriginal = htmlpage.original.Find(originalid);
        if (toriginal != null)
        {
            // This id has a row: use its text as the current sample.
            sampleString = toriginal.info;
            Console.Out.WriteLine("对照的样本 :" + sampleString);
            return;
        }

        // No row for this id: fall through and try the very next id (the increment at the
        // top of the loop is the only one, so no id is skipped).
    }
}
/// <summary>
/// Tick handler of <c>extract_timer</c>: polls <c>webBrowser1</c> and, once the page is usable,
/// stores the text of every element that matches <c>tag</c> and <c>attrs</c>, then moves on to
/// the next URL.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
/// <remarks>
/// <c>count2</c> is incremented on every tick. When it reaches 35 the current page is given up
/// and the next URL is requested. Otherwise extraction runs as soon as the browser reports
/// <c>Complete</c> or <c>Interactive</c> and a document body is available.
/// NOTE(review): presumably <c>nextextraturl</c> resets <c>count2</c> and restarts the timer — confirm.
/// </remarks>
private void extract_timer_Tick(object sender, EventArgs e)
{
    const int maxTicks = 35;

    count2++;
    if (count2 >= maxTicks)
    {
        // Still not loaded after the tick limit: skip this page and go to the next one.
        extract_timer.Stop();
        Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
        extractid++;
        nextextraturl(extractid);
        return;
    }

    WebBrowserReadyState state = this.webBrowser1.ReadyState;
    if (state != WebBrowserReadyState.Complete && state != WebBrowserReadyState.Interactive)
    {
        // Page is still loading; wait for the next tick.
        return;
    }

    // Document (or its Body) can still be null at this point, e.g. early in the Interactive
    // state. Keep polling instead of dereferencing null; the tick limit above bounds the wait.
    var document = this.webBrowser1.Document;
    HtmlElement body = document == null ? null : document.Body;
    if (body == null)
    {
        return;
    }

    extract_timer.Stop();

    // Page source is the same for every matching element, so it is read at most once.
    string pageSource = null;

    foreach (HtmlElement el in body.GetElementsByTagName(tag))
    {
        bool isRight = true;
        foreach (var item in attrs)
        {
            // The browser DOM exposes the CSS class under the attribute name "className".
            String attr = item.Name;
            if (item.Name.Equals("classname") || item.Name.Equals("class"))
            {
                attr = "className";
            }

            if (!item.Value.Equals(el.GetAttribute(attr)))
            {
                // One mismatching attribute is enough to reject the element.
                isRight = false;
                break;
            }
        }

        if (!isRight)
        {
            continue;
        }

        // Only elements with non-blank text are stored.
        string text = el.InnerText;
        if (text == null)
        {
            continue;
        }

        text = text.Trim();
        if (text.Length == 0)
        {
            continue;
        }

        if (pageSource == null)
        {
            pageSource = this.webBrowser1.DocumentText;
        }

        // site, extracturl and mark are fields set elsewhere before this handler runs.
        original orig = new original();
        orig.info = Regex.Replace(text, "\\s+", " "); // collapse whitespace runs to one space
        orig.mark = mark;
        orig.site = site;
        orig.url = extracturl;
        orig.urlid = extractid;
        orig.doc = pageSource;
        htmlpage.original.Add(orig);
        htmlpage.SaveChanges();
    }

    // Done with this page: continue with the next URL.
    extractid++;
    nextextraturl(extractid);
}