/// <summary>
/// Tick handler of <c>partition_timer</c>. Polls <c>webBrowser1</c> until the current page
/// is loaded, collects the leaf text nodes as candidates, scores them against
/// <c>sampleString</c> through <c>selector</c>, and then advances via <c>selecturl()</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void partition_timer_Tick(object sender, EventArgs e)
{
    count3++;

    // Still not loaded after 35 ticks: give up on this page and move on to the next one.
    if (count3 >= 35)
    {
        partition_timer.Stop();
        Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
        urltableid++;
        selecturl();
        return;
    }

    // The original condition was `count3 > 2 && Complete || Interactive`. Because && binds
    // tighter than ||, the Interactive state skipped the minimum-tick wait entirely.
    // The wait now applies to both ready states.
    WebBrowserReadyState state = this.webBrowser1.ReadyState;
    bool pageUsable = state == WebBrowserReadyState.Complete
                      || state == WebBrowserReadyState.Interactive;
    if (count3 <= 2 || !pageUsable)
    {
        return;
    }

    partition_timer.Stop();

    // A missing document or body is treated like a page with too few nodes instead of
    // throwing a NullReferenceException out of the timer callback.
    var document = this.webBrowser1.Document;
    if (document == null || document.Body == null)
    {
        selecturl();
        return;
    }

    Console.WriteLine(document.All.Count);
    Console.WriteLine(document.Body.All.Count);

    // Too few nodes were loaded: skip straight to the next page.
    if (document.Body.All.Count < 100)
    {
        selecturl();
        return;
    }

    List<HtmlElement> candidateslist = new List<HtmlElement>();
    foreach (HtmlElement ele in document.All)
    {
        bool isTextLeaf = ele.Children.Count == 0
                          && ele.TagName != "SCRIPT"
                          && ele.TagName != "NOSCRIPT"
                          && ele.TagName != "STYLE"
                          && ele.InnerText != null;
        if (!isTextLeaf)
        {
            continue;
        }

        // Drop nodes whose text is too short. No script stripping is needed because the
        // node is a leaf.
        if (Regex.Replace(ele.InnerText, "\\s+", "").Length > 20)
        {
            // NOTE(review): the result is discarded; kept in case getTagandAttr has side
            // effects that are not visible from here - confirm and remove if it is pure.
            getTagandAttr(ele);
            candidateslist.Add(ele);
        }
    }

    // Total number of candidate nodes selected.
    Console.WriteLine("总共选出的节点数目" + candidateslist.Count);

    // Replaces the candidate set; every call clears the previous candidates.
    selector.SetCandidate(candidateslist);

    // limit == false: every scored candidate is returned, not only the top ones.
    IEnumerable<KeyValuePair<HtmlElement, double>> dicSort = selector.select(sampleString, 0, false);
    foreach (KeyValuePair<HtmlElement, double> kvp in dicSort)
    {
        try
        {
            String[] tag_attr = getTagandAttr(kvp.Key);
            scores scores_table = new scores
            {
                urlid = urltableid,
                originalid = originalid,
                site = scoresite,
                mark = scoremark,
                info = Regex.Replace(HtmlElementprocess.getElementString(kvp.Key), "\\s+", " "),
                parent = tag_attr[0],
                current = tag_attr[1],
                firstchild = tag_attr[2],
                score = kvp.Value,
                original_current = tag_attr[3]
            };

            // NOTE(review): persistence is currently disabled, so the row built above is
            // discarded. Re-enable the two calls below to store it.
            // htmlpage.scores.Add(scores_table);
            // htmlpage.SaveChanges();
        }
        catch (Exception error)
        {
            // Exception.Data is a (usually empty) dictionary whose ToString() is only its
            // type name; print the exception itself so the failure is diagnosable.
            Console.WriteLine(error);
        }
    }

    selecturl();
}
/// <summary>
/// Tick handler of <c>weight_timer</c>. Polls <c>webBrowser1</c> until the current page is
/// loaded, collects the leaf text nodes as candidates, lets each of the ten entries of
/// <c>weightscorors</c> pick its top 6 nodes, stores every pick through <c>htmlpage</c>,
/// and then advances via <c>next_weighturl()</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void weight_timer_Tick(object sender, EventArgs e)
{
    count4++;

    // Still not loaded after 35 ticks: give up on this page and move on to the next one.
    if (count4 >= 35)
    {
        weight_timer.Stop();
        Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
        urltableid_weight++;
        next_weighturl();
        return;
    }

    // The original condition was `count4 > 2 && Complete || Interactive`. Because && binds
    // tighter than ||, the Interactive state skipped the minimum-tick wait entirely.
    // The wait now applies to both ready states.
    WebBrowserReadyState state = this.webBrowser1.ReadyState;
    bool pageUsable = state == WebBrowserReadyState.Complete
                      || state == WebBrowserReadyState.Interactive;
    if (count4 <= 2 || !pageUsable)
    {
        return;
    }

    weight_timer.Stop();

    // A missing document or body is treated like a page with too few nodes instead of
    // throwing a NullReferenceException out of the timer callback.
    var document = this.webBrowser1.Document;
    if (document == null || document.Body == null)
    {
        next_weighturl();
        return;
    }

    Console.WriteLine(document.All.Count);
    Console.WriteLine(document.Body.All.Count);

    // Too few nodes were loaded: skip straight to the next page.
    if (document.Body.All.Count < 100)
    {
        next_weighturl();
        return;
    }

    List<HtmlElement> candidateslist = new List<HtmlElement>();
    foreach (HtmlElement ele in document.All)
    {
        bool isTextLeaf = ele.Children.Count == 0
                          && ele.TagName != "SCRIPT"
                          && ele.TagName != "NOSCRIPT"
                          && ele.TagName != "STYLE"
                          && ele.InnerText != null;

        // Drop nodes whose text is too short. No script stripping is needed because the
        // node is a leaf.
        if (isTextLeaf && Regex.Replace(ele.InnerText, "\\s+", "").Length > 20)
        {
            candidateslist.Add(ele);
        }
    }

    // Total number of candidate nodes selected.
    Console.WriteLine("总共选出的节点数目" + candidateslist.Count);

    for (int n = 0; n < 10; n++)
    {
        // Replaces the candidate set; every call clears the previous candidates.
        weightscorors[n].SetCandidate(candidateslist);

        // The 6 best-ranked nodes of this scorer are taken as the extracted nodes.
        IEnumerable<KeyValuePair<HtmlElement, double>> dicSort = weightscorors[n].select(6);
        foreach (KeyValuePair<HtmlElement, double> kvp in dicSort)
        {
            try
            {
                String[] tag_attr = getTagandAttr(kvp.Key);
                weight_info weight_table = new weight_info
                {
                    urlid = urltableid_weight,
                    site = site_weight,
                    mark = mark_weight,
                    info = HtmlElementprocess.getElementString(kvp.Key),
                    parent = tag_attr[0],
                    current = tag_attr[1],
                    child = tag_attr[2],
                    score = kvp.Value,
                    samplenumber = (n + 1) * 10
                };

                // Store the pick in the database.
                htmlpage.weight_info.Add(weight_table);
                htmlpage.SaveChanges();
            }
            catch (UriFormatException ex)
            {
                // Still best-effort (the row is skipped), but no longer silent.
                Console.WriteLine(ex);
            }
        }
    }

    next_weighturl();
}
/// <summary>
/// Scores every entry of <c>candidates</c> against the concepts extracted from
/// <paramref name="sampletext"/> and returns the entries ordered by descending score.
/// </summary>
/// <remarks>
/// For each candidate the score is <c>candscortor.computeScore</c> plus
/// <c>Conceptprocess.getConceptscore</c>. While the parent element scores at least as high
/// as the current one, the parent takes the candidate's place. Candidates whose length
/// score is below 0.4 or whose combined score is below 1 are removed. <c>candidates</c> is
/// modified in place.
/// </remarks>
/// <param name="sampletext">Text from which the sample concepts are extracted.</param>
/// <param name="num">Number of top entries to return; only used when <paramref name="limit"/> is true.</param>
/// <param name="limit">True to return only the top <paramref name="num"/> entries, false to return all of them.</param>
/// <returns>A snapshot of the scored candidates, highest score first.</returns>
public IEnumerable<KeyValuePair<HtmlElement, double>> select(String sampletext, int num, bool limit)
{
    Console.Write("开始计算评分");
    Dictionary<String, int> sampleconcepts = Conceptprocess.getConcept(sampletext);

    // Iterate over a snapshot of the keys because candidates is modified inside the loop.
    List<HtmlElement> keylist = new List<HtmlElement>(candidates.Keys);
    int test = 0;
    foreach (HtmlElement key in keylist)
    {
        test++;
        Console.WriteLine("正在计算第" + test + "个");

        // A node without text cannot be scored; drop it instead of failing on Trim().
        String rawText = key.InnerText;
        if (rawText == null)
        {
            candidates.Remove(key);
            continue;
        }

        String temp = Regex.Replace(rawText.Trim(), "\\s+", " ");
        double lenscore = candscortor.lenscore(temp, candscortor.getavglen());
        double combinescore = candscortor.computeScore(temp);

        // Filter out nodes whose length score or combined score is clearly too low.
        if (lenscore < 0.4 || combinescore < 1)
        {
            candidates.Remove(key);
            continue;
        }

        // The sample concepts go first and the node's concepts second (argument order of
        // the original call).
        Dictionary<String, int> currentconcepts = Conceptprocess.getConcept(temp);
        double score = combinescore + Conceptprocess.getConceptscore(sampleconcepts, currentconcepts);

        HtmlElement current = key;
        HtmlElement parent = current.Parent;
        bool promoted = false;
        String parent_temp;
        double parent_score = scoreParentText(parent, temp, score, sampleconcepts, out parent_temp);

        // Climb while the parent scores at least as high as the current node. The null
        // check keeps a score at or below the -1 sentinel from walking past the root.
        while (parent != null && parent_score >= score)
        {
            current = parent;
            temp = parent_temp;
            score = parent_score;
            promoted = true;

            parent = current.Parent;
            parent_score = scoreParentText(parent, temp, score, sampleconcepts, out parent_temp);
        }

        // On leaving the loop, current is the node that outscores its own parent.
        Console.WriteLine("总分" + score);
        candidates[current] = score;

        // The original only removed the leaf when the ancestor was not yet a candidate, so
        // a leaf promoted to an ancestor that was already present kept a stale entry.
        if (promoted)
        {
            candidates.Remove(key);
        }
    }

    IEnumerable<KeyValuePair<HtmlElement, double>> ranked = candidates.OrderByDescending(entry => entry.Value);
    if (limit)
    {
        ranked = ranked.Take(num);
    }

    // Materialize so later changes to candidates cannot alter an already returned result.
    return ranked.ToList();
}

/// <summary>
/// Scores the text of <paramref name="parent"/> the same way <c>select</c> scores a candidate.
/// </summary>
/// <param name="parent">Parent element to score; may be null.</param>
/// <param name="childText">Normalized text of the node currently being considered.</param>
/// <param name="childScore">Score of that node; reused when the parent has identical text.</param>
/// <param name="sampleconcepts">Concepts extracted from the sample text.</param>
/// <param name="parentText">Receives the whitespace-normalized parent text, or null when there is no parent.</param>
/// <returns>The parent's score, or -1 when there is no parent, its text is empty, or its length score is not above 0.4.</returns>
private double scoreParentText(HtmlElement parent, String childText, double childScore,
                               Dictionary<String, int> sampleconcepts, out String parentText)
{
    parentText = null;
    if (parent == null)
    {
        return -1;
    }

    parentText = Regex.Replace(HtmlElementprocess.getElementString(parent), "\\s+", " ");
    if (parentText.Length == 0)
    {
        return -1;
    }

    // Reject the parent when its length score is not above 0.4.
    if (candscortor.lenscore(parentText, candscortor.getavglen()) <= 0.4)
    {
        return -1;
    }

    // Identical text means an identical score; skip recomputing it.
    if (parentText.Equals(childText))
    {
        return childScore;
    }

    Dictionary<String, int> parentconcepts = Conceptprocess.getConcept(parentText);
    return candscortor.computeScore(parentText) + Conceptprocess.getConceptscore(sampleconcepts, parentconcepts);
}