/// <summary>
/// Resolves a Baidu Baike entry. Reaching the entry page takes one extra hop: the search
/// URL is requested first, the entry's real address is extracted from that response, and
/// only then is the entry page itself downloaded.
/// </summary>
/// <param name="baidu">
/// Entry being resolved. <c>entryName</c> is read; <c>url</c> and <c>sourceHTML</c> are written
/// once the real address is known. On success <c>errExist</c> is cleared; on failure only
/// <c>errMsg</c> is set and <c>errExist</c> keeps whatever value the caller gave it.
/// </param>
public static void GetBaidu(BaikeEntry baidu)
{
    string searchUrl = "http://baike.baidu.com/searchword/?word="
        + HttpUtility.UrlEncode(baidu.entryName, GB18030)
        + "&pic=1";
    string searchResponse = GetUrlHTML(searchUrl, GB18030);

    // An empty response, or one that mentions 210.77.16.29, is reported as a network problem.
    // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
    if (searchResponse.Length == 0 || searchResponse.Contains("210.77.16.29"))
    {
        baidu.errMsg = "不能访问百度百科,请检查网络!";
        return;
    }

    // Extract the entry's real url from the search response.
    Match entryPath = Regex.Match(searchResponse, @"(?is)view.*?htm");
    if (!entryPath.Success)
    {
        baidu.errMsg = "百度百科中不存在该词条!";
        return;
    }

    string entryUrl = "http://baike.baidu.com/" + entryPath.Value;
    baidu.url = entryUrl;

    // The entry page is fetched with the same GB18030 encoding value as the search request.
    baidu.sourceHTML = GetUrlHTML(entryUrl, GB18030);

    if (baidu.sourceHTML.Length == 0)
    {
        baidu.errMsg = "百度百科词条页面读取失败!";
    }
    else
    {
        baidu.errExist = false;
    }
}
/// <summary>
/// Click handler for the compare button. Reads the entry name from the text box, runs the
/// Baidu and Hudong processing for it, computes the similarity when neither side reported
/// an error, and optionally shows both previews.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void buttonCompare_Click(object sender, EventArgs e)
{
    // Remove every whitespace character from the entered name. "\s+" produces the same
    // output as "\s*" but does not also match the empty string at every position.
    string termName = Regex.Replace(textBoxTermName.Text, @"\s+", "");
    if (termName.Length == 0)
    {
        MessageBox.Show("请输入词条名称", "提示");
        return;
    }

    AllTextReset(); // clear the information left over from the previous query

    BaikeEntry baidu = new BaikeEntry("baidu", termName);
    BaikeEntry hudong = new BaikeEntry("hudong", termName);
    GetBaiduDetails(baidu);
    GetHudongDetails(hudong);

    // The similarity is only computed when both entries came through without an error.
    if (!baidu.errExist && !hudong.errExist)
    {
        ComputeSimilitude(baidu, hudong);
    }

    if (checkBoxPreview.Checked)
    {
        webBrowserBaidu.DocumentText = baidu.preview;
        webBrowserHudong.DocumentText = hudong.preview;
    }
}
/// <summary>
/// Downloads the Hudong Baike page for an entry and records whether the entry exists.
/// </summary>
/// <param name="hudong">
/// Entry being resolved. <c>entryName</c> is read; <c>url</c> and <c>sourceHTML</c> are always
/// written. On success <c>errExist</c> is cleared; on failure only <c>errMsg</c> is set and
/// <c>errExist</c> keeps whatever value the caller gave it.
/// </param>
public static void GetHudong(BaikeEntry hudong)
{
    string entryUrl = "http://www.hudong.com/wiki/" + HttpUtility.UrlEncode(hudong.entryName, UTF8);
    hudong.url = entryUrl;

    // The page is fetched with the UTF8 encoding value.
    hudong.sourceHTML = GetUrlHTML(entryUrl, UTF8);

    // An empty page, or one that mentions 210.77.16.29, is reported as a network problem.
    // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
    if (hudong.sourceHTML.Length == 0 || hudong.sourceHTML.Contains("210.77.16.29"))
    {
        hudong.errMsg = "不能访问互动百科,请检查网络!";
        return;
    }

    // The entry counts as found only when none of the site's "missing entry" phrases appear.
    bool entryFound = !hudong.sourceHTML.Contains("您要访问的页面不存在")
        && !hudong.sourceHTML.Contains("尚未收录词条")
        && !hudong.sourceHTML.Contains("词条名字为空");

    if (entryFound)
    {
        hudong.errExist = false;
    }
    else
    {
        hudong.errMsg = "互动百科中不存在该词条!";
    }
}
/// <summary>
/// Resolves a Baidu Baike entry. Reaching the entry page takes one extra hop: the search
/// URL is requested first, the entry's real address is extracted from that response, and
/// only then is the entry page itself downloaded.
/// </summary>
/// <param name="baidu">
/// Entry being resolved. <c>entryName</c> is read; <c>url</c> and <c>sourceHTML</c> are written
/// once the real address is known. On success <c>errExist</c> is cleared; on failure only
/// <c>errMsg</c> is set and <c>errExist</c> keeps whatever value the caller gave it.
/// </param>
public static void GetBaidu(BaikeEntry baidu)
{
    string searchUrl = "http://baike.baidu.com/searchword/?word="
        + HttpUtility.UrlEncode(baidu.entryName, GB18030)
        + "&pic=1";
    string searchResponse = GetUrlHTML(searchUrl, GB18030);

    // An empty response, or one that mentions 210.77.16.29, is reported as a network problem.
    // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
    if (searchResponse.Length == 0 || searchResponse.Contains("210.77.16.29"))
    {
        baidu.errMsg = "不能访问百度百科,请检查网络!";
        return;
    }

    // Extract the entry's real url from the search response.
    Match entryPath = Regex.Match(searchResponse, @"(?is)view.*?htm");
    if (!entryPath.Success)
    {
        baidu.errMsg = "百度百科中不存在该词条!";
        return;
    }

    string entryUrl = "http://baike.baidu.com/" + entryPath.Value;
    baidu.url = entryUrl;

    // The entry page is fetched with the same GB18030 encoding value as the search request.
    baidu.sourceHTML = GetUrlHTML(entryUrl, GB18030);

    if (baidu.sourceHTML.Length == 0)
    {
        baidu.errMsg = "百度百科词条页面读取失败!";
    }
    else
    {
        baidu.errExist = false;
    }
}
/// <summary>
/// Runs the three processing stages for the Baidu entry (lookup, body-text extraction,
/// word segmentation) and shows each stage's result in the form. Stops at the first stage
/// that leaves <c>errExist</c> set.
/// </summary>
/// <param name="baidu">Entry to process; it is updated in place by each stage.</param>
private void GetBaiduDetails(BaikeEntry baidu)
{
    // Stage 1 - look up the entry.
    // Here errExist tells whether the entry exists; SourceHTML modifies its value.
    SourceHTML.GetBaidu(baidu);
    if (baidu.errExist)
    {
        textBoxBaidu.Text = baidu.errMsg;
        return;
    }
    textBoxBaidu.Text = baidu.url;

    // Stage 2 - extract the page's body text.
    // errExist is set again first, so from here it tells whether the text was extracted correctly.
    baidu.errExist = true;
    new TextExtract(baidu);
    if (baidu.errExist)
    {
        textBoxBaidu.Text = string.Concat(baidu.url, " (", baidu.errMsg, ")");
        return;
    }
    richTextBoxBaiduText.Text = baidu.text;
    labelBaiduTextNum.Text = baidu.text.Length.ToString();

    // Stage 3 - word segmentation.
    new TermFrequence(baidu);
    richTextBoxBaiduTermFreq.Text = baidu.allWordFreq;
    labelBaiduWordNum.Text = baidu.wordDic.Count.ToString();
}
/// <summary>
/// Click handler for the compare button. Reads the entry name from the text box, runs the
/// Baidu and Hudong processing for it, computes the similarity when neither side reported
/// an error, and optionally shows both previews.
/// </summary>
/// <param name="sender">Control that raised the event (not used).</param>
/// <param name="e">Event data (not used).</param>
private void buttonCompare_Click(object sender, EventArgs e)
{
    // Remove every whitespace character from the entered name. "\s+" produces the same
    // output as "\s*" but does not also match the empty string at every position.
    string termName = Regex.Replace(textBoxTermName.Text, @"\s+", "");
    if (termName.Length == 0)
    {
        MessageBox.Show("请输入词条名称", "提示");
        return;
    }

    AllTextReset(); // clear the information left over from the previous query

    BaikeEntry baidu = new BaikeEntry("baidu", termName);
    BaikeEntry hudong = new BaikeEntry("hudong", termName);
    GetBaiduDetails(baidu);
    GetHudongDetails(hudong);

    // The similarity is only computed when both entries came through without an error.
    if (!baidu.errExist && !hudong.errExist)
    {
        ComputeSimilitude(baidu, hudong);
    }

    if (checkBoxPreview.Checked)
    {
        webBrowserBaidu.DocumentText = baidu.preview;
        webBrowserHudong.DocumentText = hudong.preview;
    }
}
/// <summary>
/// Builds a <c>SimilitudeVSM</c> for the two entries and shows its result in the form.
/// </summary>
/// <param name="baidu">Baidu entry; its <c>sameWordNum</c> is displayed afterwards.</param>
/// <param name="hudong">Hudong entry to compare against.</param>
private void ComputeSimilitude(BaikeEntry baidu, BaikeEntry hudong)
{
    SimilitudeVSM vsm = new SimilitudeVSM(baidu, hudong);

    // The similarity is shown with five decimal places.
    string similitudeText = vsm.similitude.ToString("F5");
    labelSimilitude.Text = similitudeText;

    // NOTE(review): sameWordNum is presumably filled in while SimilitudeVSM is constructed - confirm.
    labelSameWordNum.Text = baidu.sameWordNum.ToString();
}
/// <summary>
/// Stores the entry and immediately runs the term-frequency steps on it, in a fixed order.
/// </summary>
/// <param name="newBaikeEntry">Entry whose terms are processed.</param>
public TermFrequence(BaikeEntry newBaikeEntry)
{
    baikeEntry = newBaikeEntry;

    GetTermDic();
    RemoveStopWords();

    // Only terms whose frequency is greater than 1 and that are longer than one character are kept.
    // NOTE(review): the source layout does not show whether this note belongs to GetTermList
    // or to UpdateTermDic - confirm.
    GetTermList();
    UpdateTermDic();

    GetTermShow();
}
/// <summary>
/// Constructor. Stores the entry, resets the extraction state, and runs <c>extract()</c>
/// straight away.
/// </summary>
/// <param name="newBaikeEntry">Entry whose page is processed.</param>
public TextExtract(BaikeEntry newBaikeEntry)
{
    baikeEntry = newBaikeEntry;

    // Start from an empty extraction state.
    blockLen = new List<int>();
    textBody = string.Empty;
    textStart = 0;
    textEnd = 0;

    extract();
}
private string[] stopWords; // stop-word list

#endregion Fields

#region Constructors

/// <summary>
/// Stores the entry and immediately runs the term-frequency steps on it, in a fixed order.
/// </summary>
/// <param name="newBaikeEntry">Entry whose terms are processed.</param>
public TermFrequence(BaikeEntry newBaikeEntry)
{
    baikeEntry = newBaikeEntry;

    GetTermDic();
    RemoveStopWords();

    // Only terms whose frequency is greater than 1 and that are longer than one character are kept.
    // NOTE(review): the source layout does not show whether this note belongs to GetTermList
    // or to UpdateTermDic - confirm.
    GetTermList();
    UpdateTermDic();

    GetTermShow();
}
private int textStart; // line number at which the page's body text starts

#endregion Fields

#region Constructors

/// <summary>
/// Constructor. Stores the entry, resets the extraction state, and runs <c>extract()</c>
/// straight away.
/// </summary>
/// <param name="newBaikeEntry">Entry whose page is processed.</param>
public TextExtract(BaikeEntry newBaikeEntry)
{
    baikeEntry = newBaikeEntry;

    // Start from an empty extraction state.
    blockLen = new List<int>();
    textBody = string.Empty;
    textStart = 0;
    textEnd = 0;

    extract();
}
/// <summary>
/// Stores the two entries, zeroes the intermediate values and the result, and then runs
/// <c>ComputeCosine()</c>.
/// </summary>
/// <param name="A">First entry of the comparison.</param>
/// <param name="B">Second entry of the comparison.</param>
public SimilitudeVSM(BaikeEntry A, BaikeEntry B)
{
    // The two entries being compared.
    entryA = A;
    entryB = B;

    // Everything starts at zero before ComputeCosine runs.
    innerProduct = 0;
    normA = 0.0;
    normB = 0.0;
    similitude = 0.0;

    ComputeCosine();
}
/// <summary>
/// Downloads the Hudong Baike page for an entry and records whether the entry exists.
/// </summary>
/// <param name="hudong">
/// Entry being resolved. <c>entryName</c> is read; <c>url</c> and <c>sourceHTML</c> are always
/// written. On success <c>errExist</c> is cleared; on failure only <c>errMsg</c> is set and
/// <c>errExist</c> keeps whatever value the caller gave it.
/// </param>
public static void GetHudong(BaikeEntry hudong)
{
    string entryUrl = "http://www.hudong.com/wiki/" + HttpUtility.UrlEncode(hudong.entryName, UTF8);
    hudong.url = entryUrl;

    // The page is fetched with the UTF8 encoding value.
    hudong.sourceHTML = GetUrlHTML(entryUrl, UTF8);

    // An empty page, or one that mentions 210.77.16.29, is reported as a network problem.
    // NOTE(review): presumably that address belongs to an intercepting gateway page - confirm.
    if (hudong.sourceHTML.Length == 0 || hudong.sourceHTML.Contains("210.77.16.29"))
    {
        hudong.errMsg = "不能访问互动百科,请检查网络!";
        return;
    }

    // The entry counts as found only when none of the site's "missing entry" phrases appear.
    bool entryFound = !hudong.sourceHTML.Contains("您要访问的页面不存在")
        && !hudong.sourceHTML.Contains("尚未收录词条")
        && !hudong.sourceHTML.Contains("词条名字为空");

    if (entryFound)
    {
        hudong.errExist = false;
    }
    else
    {
        hudong.errMsg = "互动百科中不存在该词条!";
    }
}
/// <summary>
/// Runs the three processing stages for the Baidu entry (lookup, body-text extraction,
/// word segmentation) and shows each stage's result in the form. Stops at the first stage
/// that leaves <c>errExist</c> set.
/// </summary>
/// <param name="baidu">Entry to process; it is updated in place by each stage.</param>
private void GetBaiduDetails(BaikeEntry baidu)
{
    // Stage 1 - look up the entry.
    // Here errExist tells whether the entry exists; SourceHTML modifies its value.
    SourceHTML.GetBaidu(baidu);
    if (baidu.errExist)
    {
        textBoxBaidu.Text = baidu.errMsg;
        return;
    }
    textBoxBaidu.Text = baidu.url;

    // Stage 2 - extract the page's body text.
    // errExist is set again first, so from here it tells whether the text was extracted correctly.
    baidu.errExist = true;
    new TextExtract(baidu);
    if (baidu.errExist)
    {
        textBoxBaidu.Text = string.Concat(baidu.url, " (", baidu.errMsg, ")");
        return;
    }
    richTextBoxBaiduText.Text = baidu.text;
    labelBaiduTextNum.Text = baidu.text.Length.ToString();

    // Stage 3 - word segmentation.
    new TermFrequence(baidu);
    richTextBoxBaiduTermFreq.Text = baidu.allWordFreq;
    labelBaiduWordNum.Text = baidu.wordDic.Count.ToString();
}
/// <summary>
/// Runs the three processing stages for the Hudong entry (lookup, body-text extraction,
/// word segmentation) and shows each stage's result in the form. Stops at the first stage
/// that leaves <c>errExist</c> set.
/// </summary>
/// <param name="hudong">Entry to process; it is updated in place by each stage.</param>
private void GetHudongDetails(BaikeEntry hudong)
{
    // Stage 1 - look up the entry.
    // Here errExist tells whether the entry exists; SourceHTML modifies its value.
    SourceHTML.GetHudong(hudong);
    if (hudong.errExist)
    {
        textBoxHudong.Text = hudong.errMsg;
        return;
    }
    textBoxHudong.Text = hudong.url;

    // Stage 2 - extract the page's body text.
    // errExist is set again first, so from here it tells whether the text was extracted correctly.
    hudong.errExist = true;
    new TextExtract(hudong);
    if (hudong.errExist)
    {
        textBoxHudong.Text = string.Concat(hudong.url, " (", hudong.errMsg, ")");
        return;
    }
    richTextBoxHudongText.Text = hudong.text;
    labelHudongTextNum.Text = hudong.text.Length.ToString();

    // Stage 3 - word segmentation.
    new TermFrequence(hudong);
    richTextBoxHudongTermFreq.Text = hudong.allWordFreq;
    labelHudongWordNum.Text = hudong.wordDic.Count.ToString();
}
/// <summary>
/// Runs the three processing stages for the Hudong entry (lookup, body-text extraction,
/// word segmentation) and shows each stage's result in the form. Stops at the first stage
/// that leaves <c>errExist</c> set.
/// </summary>
/// <param name="hudong">Entry to process; it is updated in place by each stage.</param>
private void GetHudongDetails(BaikeEntry hudong)
{
    // Stage 1 - look up the entry.
    // Here errExist tells whether the entry exists; SourceHTML modifies its value.
    SourceHTML.GetHudong(hudong);
    if (hudong.errExist)
    {
        textBoxHudong.Text = hudong.errMsg;
        return;
    }
    textBoxHudong.Text = hudong.url;

    // Stage 2 - extract the page's body text.
    // errExist is set again first, so from here it tells whether the text was extracted correctly.
    hudong.errExist = true;
    new TextExtract(hudong);
    if (hudong.errExist)
    {
        textBoxHudong.Text = string.Concat(hudong.url, " (", hudong.errMsg, ")");
        return;
    }
    richTextBoxHudongText.Text = hudong.text;
    labelHudongTextNum.Text = hudong.text.Length.ToString();

    // Stage 3 - word segmentation.
    new TermFrequence(hudong);
    richTextBoxHudongTermFreq.Text = hudong.allWordFreq;
    labelHudongWordNum.Text = hudong.wordDic.Count.ToString();
}