Beispiel #1
0
 /// <summary>
 /// Timer tick handler that waits for <c>webBrowser1</c> to finish loading, collects candidate
 /// leaf elements from the loaded document, scores them with <c>selector</c>, and then calls
 /// <c>selecturl()</c> to continue.
 /// </summary>
 /// <param name="sender">Event source; not used.</param>
 /// <param name="e">Event data; not used.</param>
 private void partition_timer_Tick(object sender, EventArgs e)
 {
     count3++;
     if (count3 >= 35) // Still not loaded after 35 ticks: give up on this page and take the next one.
     {
         partition_timer.Stop();
         Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
         urltableid++;
         selecturl();
         return;
     }

     // The minimum-tick check must apply to both ready states. Written without parentheses,
     // `a && b || c` parses as `(a && b) || c`, which let the Interactive state bypass it.
     WebBrowserReadyState state = this.webBrowser1.ReadyState;
     bool pageReady = state == WebBrowserReadyState.Complete || state == WebBrowserReadyState.Interactive;
     if (count3 <= 2 || !pageReady)
     {
         return;
     }

     partition_timer.Stop();

     // Document (and its Body) can be null when nothing usable has been loaded;
     // treat that the same way as a page with too few nodes.
     var document = this.webBrowser1.Document;
     if (document == null || document.Body == null)
     {
         selecturl();
         return;
     }

     Console.WriteLine(document.All.Count);
     Console.WriteLine(document.Body.All.Count);

     if (document.Body.All.Count < 100)
     {
         selecturl();   // Too few nodes were loaded: go straight to the next one.
         return;
     }

     List <HtmlElement> candidateslist = new List <HtmlElement>();
     foreach (HtmlElement ele in document.All)
     {
         // Only leaf elements that carry text and are not script/style containers are considered.
         bool isTextLeaf = ele.Children.Count == 0 &&
                           ele.TagName != "SCRIPT" &&
                           ele.TagName != "NOSCRIPT" &&
                           ele.TagName != "STYLE" &&
                           ele.InnerText != null;
         if (!isTextLeaf)
         {
             continue;
         }

         // Drop very short texts (20 or fewer non-whitespace characters).
         if (Regex.Replace(ele.InnerText, "\\s+", "").Length > 20)
         {
             // NOTE(review): the return value is discarded; the call is kept in case it has side effects.
             getTagandAttr(ele);
             candidateslist.Add(ele);
         }
     }
     Console.WriteLine("总共选出的节点数目" + candidateslist.Count); // Total number of candidate nodes.

     // Per the original author's note, SetCandidate replaces any previously set candidates.
     selector.SetCandidate(candidateslist);

     // limit == false: every scored candidate is returned, not only the top ones.
     IEnumerable <KeyValuePair <HtmlElement, double> > dicSort = selector.select(sampleString, 0, false);
     foreach (KeyValuePair <HtmlElement, double> kvp in dicSort)
     {
         try
         {
             String[] tag_attr     = getTagandAttr(kvp.Key);
             scores   scores_table = new scores();
             scores_table.urlid            = urltableid;
             scores_table.originalid       = originalid;
             scores_table.site             = scoresite;
             scores_table.mark             = scoremark;
             scores_table.info             = Regex.Replace(HtmlElementprocess.getElementString(kvp.Key), "\\s+", " ");
             scores_table.parent           = tag_attr[0];
             scores_table.current          = tag_attr[1];
             scores_table.firstchild       = tag_attr[2];
             scores_table.score            = kvp.Value;
             scores_table.original_current = tag_attr[3];
             // NOTE(review): the row is built but not persisted; the original had the
             // htmlpage.scores.Add(...) / htmlpage.SaveChanges() calls commented out.
         }
         catch (Exception error)
         {
             // Log the exception itself; Exception.Data is only an (often empty) key/value bag.
             Console.WriteLine(error);
         }
     }
     selecturl();
 }
Beispiel #2
0
        /// <summary>
        /// Timer tick handler that waits for <c>webBrowser1</c> to finish loading, collects candidate
        /// leaf elements, lets each of the first ten <c>weightscorors</c> pick its top 6 nodes, stores
        /// those picks through <c>htmlpage</c>, and then calls <c>next_weighturl()</c> to continue.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void weight_timer_Tick(object sender, EventArgs e)
        {
            count4++;
            if (count4 >= 35) // Still not loaded after 35 ticks: give up on this page and take the next one.
            {
                weight_timer.Stop();
                Console.Out.WriteLine("此处记得添加解析不成功是的下一页代码");
                urltableid_weight++;
                next_weighturl();
                return;
            }

            // The minimum-tick check must apply to both ready states. Written without parentheses,
            // `a && b || c` parses as `(a && b) || c`, which let the Interactive state bypass it.
            WebBrowserReadyState state = this.webBrowser1.ReadyState;
            bool pageReady = state == WebBrowserReadyState.Complete || state == WebBrowserReadyState.Interactive;
            if (count4 <= 2 || !pageReady)
            {
                return;
            }

            weight_timer.Stop();

            // Document (and its Body) can be null when nothing usable has been loaded;
            // treat that the same way as a page with too few nodes.
            var document = this.webBrowser1.Document;
            if (document == null || document.Body == null)
            {
                next_weighturl();
                return;
            }

            Console.WriteLine(document.All.Count);
            Console.WriteLine(document.Body.All.Count);

            if (document.Body.All.Count < 100)
            {
                next_weighturl();   // Too few nodes were loaded: go straight to the next one.
                return;
            }

            List <HtmlElement> candidateslist = new List <HtmlElement>();
            foreach (HtmlElement ele in document.All)
            {
                // Only leaf elements that carry text and are not script/style containers are considered.
                bool isTextLeaf = ele.Children.Count == 0 &&
                                  ele.TagName != "SCRIPT" &&
                                  ele.TagName != "NOSCRIPT" &&
                                  ele.TagName != "STYLE" &&
                                  ele.InnerText != null;
                if (!isTextLeaf)
                {
                    continue;
                }

                // Drop very short texts (20 or fewer non-whitespace characters).
                if (Regex.Replace(ele.InnerText, "\\s+", "").Length > 20)
                {
                    candidateslist.Add(ele);
                }
            }
            Console.WriteLine("总共选出的节点数目" + candidateslist.Count); // Total number of candidate nodes.

            // NOTE(review): the loop bound 10 is hard-coded; weightscorors is assumed to hold at least 10 entries.
            for (int n = 0; n < 10; n++)
            {
                // Per the original author's note, SetCandidate replaces any previously set candidates.
                weightscorors[n].SetCandidate(candidateslist);

                // Take the 6 best-ranked nodes as the extracted ones.
                IEnumerable <KeyValuePair <HtmlElement, double> > dicSort = weightscorors[n].select(6);
                foreach (KeyValuePair <HtmlElement, double> kvp in dicSort)
                {
                    try
                    {
                        // Store the result in the database.
                        String[]    tag_attr     = getTagandAttr(kvp.Key);
                        weight_info weight_table = new weight_info();

                        weight_table.urlid = urltableid_weight;

                        weight_table.site         = site_weight;
                        weight_table.mark         = mark_weight;
                        weight_table.info         = HtmlElementprocess.getElementString(kvp.Key);
                        weight_table.parent       = tag_attr[0];
                        weight_table.current      = tag_attr[1];
                        weight_table.child        = tag_attr[2];
                        weight_table.score        = kvp.Value;
                        weight_table.samplenumber = (n + 1) * 10;
                        htmlpage.weight_info.Add(weight_table);
                        htmlpage.SaveChanges();
                    }
                    catch (UriFormatException error)
                    {
                        // Still best-effort (the row is skipped), but no longer silent.
                        Console.WriteLine(error);
                    }
                }
            }
            next_weighturl();
        }
        /// <summary>
        /// Scores every entry of <c>candidates</c> against the concepts extracted from
        /// <paramref name="sampletext"/> and returns the surviving entries ordered by descending score.
        /// A candidate whose parent scores at least as high is replaced by that parent, repeatedly,
        /// until an ancestor scores strictly lower or there is no parent left.
        /// </summary>
        /// <param name="sampletext">Sample text from which the reference concepts are extracted.</param>
        /// <param name="num">Number of top-ranked entries to return; only used when <paramref name="limit"/> is true.</param>
        /// <param name="limit">True to return only the top <paramref name="num"/> entries; false to return all of them.</param>
        /// <returns>A materialized snapshot of (element, score) pairs, highest score first.</returns>
        public IEnumerable <KeyValuePair <HtmlElement, double> > select(String sampletext, int num, bool limit)
        {
            Console.Write("开始计算评分");
            int processed = 0;
            Dictionary <String, int> sampleconcepts = Conceptprocess.getConcept(sampletext);

            // Iterate over a snapshot of the keys: candidates is modified inside the loop.
            List <HtmlElement> keylist = new List <HtmlElement>(candidates.Keys);

            foreach (HtmlElement key in keylist)
            {
                processed++;
                Console.WriteLine("正在计算第" + processed + "个");

                // An element without text cannot be scored; drop it instead of throwing on Trim().
                String rawText = key.InnerText;
                if (rawText == null)
                {
                    candidates.Remove(key);
                    continue;
                }

                String temp         = Regex.Replace(rawText.Trim(), "\\s+", " "); // Text of the current node.
                double lenscore     = candscortor.lenscore(temp, candscortor.getavglen());
                double combinescore = candscortor.computeScore(temp);             // Combined score of the node.

                // Filter out nodes whose length score or combined score is clearly too low.
                if (lenscore < 0.4 || combinescore < 1)
                {
                    candidates.Remove(key);
                    continue;
                }

                // Per the original author's note: the sample concepts go first and the node under
                // evaluation second, because the node's concept vector is derived from the sample.
                Dictionary <String, int> currentconcepts = Conceptprocess.getConcept(temp);
                double      score   = combinescore + Conceptprocess.getConceptscore(sampleconcepts, currentconcepts);
                HtmlElement current = key;
                HtmlElement parent  = current.Parent;

                // Climb towards the root for as long as the parent scores at least as high as the current node.
                while (parent != null)
                {
                    String parent_temp  = Regex.Replace(HtmlElementprocess.getElementString(parent), "\\s+", " ");
                    double parent_score = scoreParentText(parent_temp, temp, score, sampleconcepts);
                    if (!(parent_score >= score))
                    {
                        break;
                    }
                    current = parent;
                    temp    = parent_temp;
                    score   = parent_score;
                    parent  = current.Parent;
                }

                Console.WriteLine("总分" + score);

                // Record the winning node (adds it or overwrites its previous score).
                candidates[current] = score;

                // A leaf that was replaced by one of its ancestors must not stay behind with a stale score;
                // the original only removed it when the ancestor was not yet present in candidates.
                if (!current.Equals(key))
                {
                    candidates.Remove(key);
                }
            }

            var dicSort = from objDic in candidates orderby objDic.Value descending select objDic;

            // Materialize the result: candidates is a mutable field, so a deferred query
            // would be re-evaluated against whatever it contains when the caller enumerates.
            if (limit)
            {
                return(dicSort.Take(num).ToList());
            }
            return(dicSort.ToList());
        }

        /// <summary>
        /// Computes the score of a parent node's text, or -1 when the parent must not be considered.
        /// </summary>
        /// <param name="parentText">Whitespace-normalized text of the parent node; may be null or empty.</param>
        /// <param name="currentText">Whitespace-normalized text of the node currently selected.</param>
        /// <param name="currentScore">Score of the node currently selected.</param>
        /// <param name="sampleconcepts">Concepts extracted from the sample text.</param>
        /// <returns>
        /// -1 when the parent text is missing or its length score is not above 0.4;
        /// <paramref name="currentScore"/> when both texts are equal; otherwise the parent's
        /// combined score plus its concept score.
        /// </returns>
        private double scoreParentText(String parentText, String currentText, double currentScore, Dictionary <String, int> sampleconcepts)
        {
            if (parentText == null || parentText.Length == 0)
            {
                return(-1);
            }

            // Per the original author's note: a parent whose length has grown too much is rejected as well.
            if (!(candscortor.lenscore(parentText, candscortor.getavglen()) > 0.4))
            {
                return(-1);
            }

            // Identical text means the parent adds nothing, so it inherits the current score.
            if (parentText.Equals(currentText))
            {
                return(currentScore);
            }

            Dictionary <String, int> parentconcepts = Conceptprocess.getConcept(parentText);
            return(candscortor.computeScore(parentText) + Conceptprocess.getConceptscore(sampleconcepts, parentconcepts));
        }