Esempio n. 1
0
        // Looks up baidu.entryName on Baidu Baike. The search URL does not serve the
        // entry itself, so a second request is made to the entry link found in the
        // search response.
        //
        // On success: baidu.url and baidu.sourceHTML are filled in and baidu.errExist
        // is set to false. On failure: baidu.errMsg is set and errExist is left untouched.
        public static void GetBaidu(BaikeEntry baidu)
        {
            string searchUrl  = "http://baike.baidu.com/searchword/?word=" + HttpUtility.UrlEncode(baidu.entryName, GB18030) + "&pic=1";
            string searchPage = GetUrlHTML(searchUrl, GB18030);

            // An empty response, or one mentioning 210.77.16.29, is reported as a network problem.
            // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
            bool reachable = searchPage.Length != 0 && !searchPage.Contains("210.77.16.29");
            if (!reachable)
            {
                baidu.errMsg = "不能访问百度百科,请检查网络!";
                return;
            }

            // Pull the real entry link ("view...htm") out of the search response.
            Match entryLink = Regex.Match(searchPage, @"(?is)view.*?htm");
            if (!entryLink.Success)
            {
                baidu.errMsg = "百度百科中不存在该词条!";
                return;
            }

            string entryUrl = "http://baike.baidu.com/" + entryLink.ToString();
            baidu.url = entryUrl;

            // The original note says Baidu Baike pages are GB2312-encoded; the page is
            // requested with the GB18030 argument here.
            baidu.sourceHTML = GetUrlHTML(entryUrl, GB18030);
            if (baidu.sourceHTML.Length == 0)
            {
                baidu.errMsg = "百度百科词条页面读取失败!";
                return;
            }

            baidu.errExist = false;
        }
        // Click handler of the compare button: fetches the entered term from both
        // encyclopedias, shows the per-site details and, when neither side reports
        // an error, computes their similarity.
        private void buttonCompare_Click(object sender, EventArgs e)
        {
            // Remove every whitespace character from the entered term.
            string keyword = Regex.Replace(textBoxTermName.Text, @"(?is)\s*", "");
            if (keyword.Length == 0)
            {
                MessageBox.Show("请输入词条名称", "提示");
                return;
            }

            // Clear whatever the previous query left on screen.
            AllTextReset();

            BaikeEntry baiduEntry  = new BaikeEntry("baidu", keyword);
            BaikeEntry hudongEntry = new BaikeEntry("hudong", keyword);

            GetBaiduDetails(baiduEntry);
            GetHudongDetails(hudongEntry);

            // Similarity is only computed when neither entry is flagged with an error.
            bool anyFailed = baiduEntry.errExist || hudongEntry.errExist;
            if (!anyFailed)
            {
                ComputeSimilitude(baiduEntry, hudongEntry);
            }

            if (checkBoxPreview.Checked)
            {
                webBrowserBaidu.DocumentText  = baiduEntry.preview;
                webBrowserHudong.DocumentText = hudongEntry.preview;
            }

            /*********************  debugging aid  *****************************/
            //richTextBoxBaiduText.Text  = baiduEntry.preview;
            //richTextBoxHudongText.Text = hudongEntry.preview;
        }
Esempio n. 3
0
        // Fetches the Hudong Baike page for hudong.entryName.
        //
        // hudong.url and hudong.sourceHTML are always assigned. On success
        // hudong.errExist is set to false; on failure hudong.errMsg is set and
        // errExist is left untouched.
        public static void GetHudong(BaikeEntry hudong)
        {
            string entryUrl = "http://www.hudong.com/wiki/" + HttpUtility.UrlEncode(hudong.entryName, UTF8);
            hudong.url = entryUrl;

            // Original note: Hudong Baike pages are UTF-8 encoded.
            hudong.sourceHTML = GetUrlHTML(entryUrl, UTF8);
            string page = hudong.sourceHTML;

            // An empty response, or one mentioning 210.77.16.29, is reported as a network problem.
            // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
            if (page.Length == 0 || page.Contains("210.77.16.29"))
            {
                hudong.errMsg = "不能访问互动百科,请检查网络!";
                return;
            }

            // Any of these phrases in the page means the entry does not exist.
            string[] missingMarkers = { "您要访问的页面不存在", "尚未收录词条", "词条名字为空" };
            foreach (string marker in missingMarkers)
            {
                if (page.Contains(marker))
                {
                    hudong.errMsg = "互动百科中不存在该词条!";
                    return;
                }
            }

            hudong.errExist = false;
        }
Esempio n. 4
0
        // Looks up baidu.entryName on Baidu Baike. Reaching the entry page takes one
        // extra hop: the search response only contains a link to the entry, which is
        // then requested separately.
        //
        // On success: baidu.url and baidu.sourceHTML are filled in and baidu.errExist
        // is set to false. On failure: baidu.errMsg is set and errExist is left untouched.
        public static void GetBaidu(BaikeEntry baidu)
        {
            string searchUrl  = "http://baike.baidu.com/searchword/?word=" + HttpUtility.UrlEncode(baidu.entryName, GB18030) + "&pic=1";
            string searchPage = GetUrlHTML(searchUrl, GB18030);

            // An empty response, or one mentioning 210.77.16.29, is reported as a network problem.
            // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
            if (searchPage.Length == 0 || searchPage.Contains("210.77.16.29"))
            {
                baidu.errMsg = "不能访问百度百科,请检查网络!";
                return;
            }

            // Pull the real entry link ("view...htm") out of the search response.
            Match entryLink = Regex.Match(searchPage, @"(?is)view.*?htm");
            if (!entryLink.Success)
            {
                baidu.errMsg = "百度百科中不存在该词条!";
                return;
            }

            string entryUrl = "http://baike.baidu.com/" + entryLink.ToString();
            baidu.url = entryUrl;

            // The original note says Baidu Baike pages are GB2312-encoded; the page is
            // requested with the GB18030 argument here.
            baidu.sourceHTML = GetUrlHTML(entryUrl, GB18030);
            if (baidu.sourceHTML.Length == 0)
            {
                baidu.errMsg = "百度百科词条页面读取失败!";
                return;
            }

            baidu.errExist = false;
        }
        // Runs the three stages for the Baidu entry and shows each stage's result in
        // the form: look the entry up, extract the page text, count term frequencies.
        // baidu.errExist doubles as the failure flag of stages 1 and 2; the method
        // returns early, leaving the flag set, as soon as a stage fails.
        private void GetBaiduDetails(BaikeEntry baidu)
        {
            // Stage 1 - search for the entry.
            // errExist tells whether the entry exists; SourceHTML modifies its value.
            SourceHTML.GetBaidu(baidu);
            if (baidu.errExist)
            {
                textBoxBaidu.Text = baidu.errMsg;
                return;
            }
            textBoxBaidu.Text = baidu.url;

            // Stage 2 - extract the page's main text.
            // The flag is re-armed first; here it tells whether the text was extracted
            // correctly (the TextExtract constructor is expected to update it - confirm).
            baidu.errExist = true;
            TextExtract extractor = new TextExtract(baidu);
            if (baidu.errExist)
            {
                textBoxBaidu.Text = baidu.url + " (" + baidu.errMsg + ")";
                return;
            }
            richTextBoxBaiduText.Text = baidu.text;
            labelBaiduTextNum.Text = baidu.text.Length.ToString();

            // Stage 3 - word segmentation / term frequencies.
            TermFrequence termFrequency = new TermFrequence(baidu);
            richTextBoxBaiduTermFreq.Text = baidu.allWordFreq;
            labelBaiduWordNum.Text = baidu.wordDic.Count.ToString();
        }
        // Click handler of the compare button: fetches the entered term from both
        // encyclopedias, shows the per-site details and, when neither side reports
        // an error, computes their similarity.
        private void buttonCompare_Click(object sender, EventArgs e)
        {
            // Remove every whitespace character from the entered term.
            string keyword = Regex.Replace(textBoxTermName.Text, @"(?is)\s*", "");
            if (keyword.Length == 0)
            {
                MessageBox.Show("请输入词条名称", "提示");
                return;
            }

            // Clear whatever the previous query left on screen.
            AllTextReset();

            BaikeEntry baiduEntry  = new BaikeEntry("baidu", keyword);
            BaikeEntry hudongEntry = new BaikeEntry("hudong", keyword);

            GetBaiduDetails(baiduEntry);
            GetHudongDetails(hudongEntry);

            // Similarity is only computed when neither entry is flagged with an error.
            bool anyFailed = baiduEntry.errExist || hudongEntry.errExist;
            if (!anyFailed)
            {
                ComputeSimilitude(baiduEntry, hudongEntry);
            }

            if (checkBoxPreview.Checked)
            {
                webBrowserBaidu.DocumentText  = baiduEntry.preview;
                webBrowserHudong.DocumentText = hudongEntry.preview;
            }

            /*********************  debugging aid  *****************************/
            //richTextBoxBaiduText.Text  = baiduEntry.preview;
            //richTextBoxHudongText.Text = hudongEntry.preview;
        }
        // Compares the two entries with SimilitudeVSM and shows the outcome:
        // the similarity value with five decimals and baidu.sameWordNum.
        private void ComputeSimilitude(BaikeEntry baidu, BaikeEntry hudong)
        {
            // Constructing the object runs the comparison (its constructor calls ComputeCosine).
            SimilitudeVSM cosine = new SimilitudeVSM(baidu, hudong);
            string formattedScore = cosine.similitude.ToString("F5");

            labelSimilitude.Text = formattedScore;
            // Read after the comparison - presumably sameWordNum is filled in by it; confirm.
            labelSameWordNum.Text = baidu.sameWordNum.ToString();
        }
Esempio n. 8
0
        // Stores the entry to analyse and immediately runs the term-frequency steps
        // below in this fixed order. The helpers are defined elsewhere; their exact
        // effects are not visible here.
        public TermFrequence(BaikeEntry newBaikeEntry)
        {
            baikeEntry = newBaikeEntry;

            GetTermDic();
            RemoveStopWords();
            GetTermList();  // keep only terms whose frequency is above 1 and that are longer than one character
            UpdateTermDic();
            GetTermShow();
        }
Esempio n. 9
0
        // Constructor: resets the extraction state, remembers the entry to work on
        // and runs extract() right away.
        public TextExtract(BaikeEntry newBaikeEntry)
        {
            // Fresh working state for this extraction.
            blockLen  = new List<int>();
            textBody  = "";
            textEnd   = 0;
            textStart = 0;

            baikeEntry = newBaikeEntry;

            extract();
        }
Esempio n. 10
0
        private string[] stopWords; // stop-word list

        #endregion Fields

        #region Constructors

        // Stores the entry to analyse and immediately runs the term-frequency steps
        // below in this fixed order. The helpers are defined elsewhere; their exact
        // effects are not visible here.
        public TermFrequence(BaikeEntry newBaikeEntry)
        {
            baikeEntry = newBaikeEntry;

            GetTermDic();
            RemoveStopWords();
            GetTermList();  // keep only terms whose frequency is above 1 and that are longer than one character
            UpdateTermDic();
            GetTermShow();
        }
Esempio n. 11
0
        private int textStart; // line number at which the page's main text starts

        #endregion Fields

        #region Constructors

        // Constructor: resets the extraction state, remembers the entry to work on
        // and runs extract() right away.
        public TextExtract(BaikeEntry newBaikeEntry)
        {
            // Fresh working state for this extraction.
            blockLen  = new List<int>();
            textBody  = "";
            textEnd   = 0;
            textStart = 0;

            baikeEntry = newBaikeEntry;

            extract();
        }
Esempio n. 12
0
        // Sets up a comparison of entries A and B and computes it immediately:
        // the accumulators are zeroed, both entries are stored, then ComputeCosine() runs.
        public SimilitudeVSM(BaikeEntry A, BaikeEntry B)
        {
            // Start every accumulator from zero.
            similitude   = 0.0;
            normB        = 0.0;
            normA        = 0.0;
            innerProduct = 0;

            entryB = B;
            entryA = A;

            ComputeCosine();
        }
Esempio n. 13
0
        // Sets up a comparison of entries A and B and computes it immediately:
        // the accumulators are zeroed, both entries are stored, then ComputeCosine() runs.
        public SimilitudeVSM(BaikeEntry A, BaikeEntry B)
        {
            // Start every accumulator from zero.
            similitude   = 0.0;
            normB        = 0.0;
            normA        = 0.0;
            innerProduct = 0;

            entryB = B;
            entryA = A;

            ComputeCosine();
        }
Esempio n. 14
0
        // Fetches the Hudong Baike page for hudong.entryName.
        //
        // hudong.url and hudong.sourceHTML are always assigned. On success
        // hudong.errExist is set to false; on failure hudong.errMsg is set and
        // errExist is left untouched.
        public static void GetHudong(BaikeEntry hudong)
        {
            string entryUrl = "http://www.hudong.com/wiki/" + HttpUtility.UrlEncode(hudong.entryName, UTF8);
            hudong.url = entryUrl;

            // Original note: Hudong Baike pages are UTF-8 encoded.
            hudong.sourceHTML = GetUrlHTML(entryUrl, UTF8);
            string page = hudong.sourceHTML;

            // An empty response, or one mentioning 210.77.16.29, is reported as a network problem.
            // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
            if (page.Length == 0 || page.Contains("210.77.16.29"))
            {
                hudong.errMsg = "不能访问互动百科,请检查网络!";
                return;
            }

            // Any of these phrases in the page means the entry does not exist.
            string[] missingMarkers = { "您要访问的页面不存在", "尚未收录词条", "词条名字为空" };
            foreach (string marker in missingMarkers)
            {
                if (page.Contains(marker))
                {
                    hudong.errMsg = "互动百科中不存在该词条!";
                    return;
                }
            }

            hudong.errExist = false;
        }
Esempio n. 15
0
        // Runs the three stages for the Baidu entry and shows each stage's result in
        // the form: look the entry up, extract the page text, count term frequencies.
        // baidu.errExist doubles as the failure flag of stages 1 and 2; the method
        // returns early, leaving the flag set, as soon as a stage fails.
        private void GetBaiduDetails(BaikeEntry baidu)
        {
            // Stage 1 - search for the entry.
            // errExist tells whether the entry exists; SourceHTML modifies its value.
            SourceHTML.GetBaidu(baidu);
            if (baidu.errExist)
            {
                textBoxBaidu.Text = baidu.errMsg;
                return;
            }
            textBoxBaidu.Text = baidu.url;

            // Stage 2 - extract the page's main text.
            // The flag is re-armed first; here it tells whether the text was extracted
            // correctly (the TextExtract constructor is expected to update it - confirm).
            baidu.errExist = true;
            TextExtract extractor = new TextExtract(baidu);
            if (baidu.errExist)
            {
                textBoxBaidu.Text = baidu.url + " (" + baidu.errMsg + ")";
                return;
            }
            richTextBoxBaiduText.Text = baidu.text;
            labelBaiduTextNum.Text = baidu.text.Length.ToString();

            // Stage 3 - word segmentation / term frequencies.
            TermFrequence termFrequency = new TermFrequence(baidu);
            richTextBoxBaiduTermFreq.Text = baidu.allWordFreq;
            labelBaiduWordNum.Text = baidu.wordDic.Count.ToString();
        }
Esempio n. 16
0
        // Runs the three stages for the Hudong entry and shows each stage's result in
        // the form: look the entry up, extract the page text, count term frequencies.
        // hudong.errExist doubles as the failure flag of stages 1 and 2; the method
        // returns early, leaving the flag set, as soon as a stage fails.
        private void GetHudongDetails(BaikeEntry hudong)
        {
            // Stage 1 - search for the entry.
            // errExist tells whether the entry exists; SourceHTML modifies its value.
            SourceHTML.GetHudong(hudong);
            if (hudong.errExist)
            {
                textBoxHudong.Text = hudong.errMsg;
                return;
            }
            textBoxHudong.Text = hudong.url;

            // Stage 2 - extract the page's main text.
            // The flag is re-armed first; here it tells whether the text was extracted
            // correctly (the TextExtract constructor is expected to update it - confirm).
            hudong.errExist = true;
            TextExtract extractor = new TextExtract(hudong);
            if (hudong.errExist)
            {
                textBoxHudong.Text = hudong.url + " (" + hudong.errMsg + ")";
                return;
            }
            richTextBoxHudongText.Text = hudong.text;
            labelHudongTextNum.Text = hudong.text.Length.ToString();

            // Stage 3 - word segmentation / term frequencies.
            TermFrequence termFrequency = new TermFrequence(hudong);
            richTextBoxHudongTermFreq.Text = hudong.allWordFreq;
            labelHudongWordNum.Text = hudong.wordDic.Count.ToString();
        }
        // Runs the three stages for the Hudong entry and shows each stage's result in
        // the form: look the entry up, extract the page text, count term frequencies.
        // hudong.errExist doubles as the failure flag of stages 1 and 2; the method
        // returns early, leaving the flag set, as soon as a stage fails.
        private void GetHudongDetails(BaikeEntry hudong)
        {
            // Stage 1 - search for the entry.
            // errExist tells whether the entry exists; SourceHTML modifies its value.
            SourceHTML.GetHudong(hudong);
            if (hudong.errExist)
            {
                textBoxHudong.Text = hudong.errMsg;
                return;
            }
            textBoxHudong.Text = hudong.url;

            // Stage 2 - extract the page's main text.
            // The flag is re-armed first; here it tells whether the text was extracted
            // correctly (the TextExtract constructor is expected to update it - confirm).
            hudong.errExist = true;
            TextExtract extractor = new TextExtract(hudong);
            if (hudong.errExist)
            {
                textBoxHudong.Text = hudong.url + " (" + hudong.errMsg + ")";
                return;
            }
            richTextBoxHudongText.Text = hudong.text;
            labelHudongTextNum.Text = hudong.text.Length.ToString();

            // Stage 3 - word segmentation / term frequencies.
            TermFrequence termFrequency = new TermFrequence(hudong);
            richTextBoxHudongTermFreq.Text = hudong.allWordFreq;
            labelHudongWordNum.Text = hudong.wordDic.Count.ToString();
        }
 // Compares the two entries with SimilitudeVSM and shows the outcome:
 // the similarity value with five decimals and baidu.sameWordNum.
 private void ComputeSimilitude(BaikeEntry baidu, BaikeEntry hudong)
 {
     // Constructing the object runs the comparison (its constructor calls ComputeCosine).
     SimilitudeVSM cosine = new SimilitudeVSM(baidu, hudong);
     string formattedScore = cosine.similitude.ToString("F5");

     labelSimilitude.Text = formattedScore;
     // Read after the comparison - presumably sameWordNum is filled in by it; confirm.
     labelSameWordNum.Text = baidu.sameWordNum.ToString();
 }