/// <summary>
/// Waits on <c>Datas.auto_read</c>, then walks <c>Datas.temp_data</c> from index
/// <c>Datas.now_readnum</c> to the end. Posts that match fewer than four of the
/// comma-separated terms in <c>Datas.ads_word</c> are cleaned, segmented and counted
/// into the working window (slot 6 of <c>Datas.frequency</c> / <c>Datas.total_data</c>).
/// Each time <c>Datas.now_time_max</c> posts have been collected the window is moved into
/// slot <c>now_time_read % 6</c>, per-word weights and burstiness are computed against the
/// preceding slot, bursty words are merged into clusters, and the multi-word clusters are
/// stored in <c>Datas.ju_list</c>.
/// </summary>
public void readdata()
{
    // High surrogates matched by the original numeric checks (55356 and 55357).
    const char EmojiHighSurrogateA = '\uD83C';
    const char EmojiHighSurrogateB = '\uD83D';

    Segment segment = new Segment();
    string[] filterTerms = Datas.ads_word.Split(',');
    HashSet<string> seenInPost = new HashSet<string>();

    Datas.auto_read.WaitOne();

    for (; Datas.now_readnum < Datas.temp_data.Count; Datas.now_readnum++)
    {
        // NOTE(review): the loop condition already guarantees now_readnum < Count, so this
        // wait can only trigger if the collection shrinks concurrently. Kept as-is; confirm
        // whether the intent was to block here until more data arrives.
        if (Datas.now_readnum == Datas.temp_data.Count)
        {
            Datas.auto_read.WaitOne();
        }

        SinaJSON item = Datas.temp_data.ElementAt(Datas.now_readnum);

        // Count how many filter terms occur in the post; four or more hits skips the post.
        int filterHits = 0;
        foreach (string term in filterTerms)
        {
            if (item.Text.IndexOf(term) != -1)
            {
                filterHits++;
            }
        }

        if (filterHits >= 4)
        {
            continue;
        }

        // Strip latin-letter runs (URLs and similar) before segmentation.
        string cleaned = Regex.Replace(item.Text, @"[a-zA-Z]+[:/]*[a-zA-Z\d\./]+", "");

        // Remove emoji surrogate pairs. The single-argument string.Remove(i) deletes
        // everything from i to the end of the string, which discarded all text after the
        // first emoji; only the pair itself is removed here.
        for (int k = cleaned.Length - 1; k > -1; k--)
        {
            if (cleaned[k] == EmojiHighSurrogateA || cleaned[k] == EmojiHighSurrogateB)
            {
                bool hasLowSurrogate = k + 1 < cleaned.Length && char.IsLowSurrogate(cleaned[k + 1]);
                cleaned = cleaned.Remove(k, hasLowSurrogate ? 2 : 1);
            }
        }

        item.Text = cleaned;

        // Count each distinct word once per post and record the post's index in the window.
        ICollection<WordInfo> tokens = segment.DoSegment(item.Text);
        seenInPost.Clear();
        foreach (WordInfo wordInfo in tokens)
        {
            if (!seenInPost.Add(wordInfo.Word))
            {
                continue;
            }

            微博舆论.words known;
            if (Datas.frequency[6].TryGetValue(wordInfo.Word, out known))
            {
                known.now_nu++;
                known.position.Add(Datas.now_time_count);
            }
            else
            {
                Datas.frequency[6].Add(wordInfo.Word, new words(wordInfo.Word, Datas.now_time_count));
            }
        }

        Datas.total_data[6].Add(item);
        Datas.total_num++;
        Datas.now_time_count++;

        if (Datas.now_time_count < Datas.now_time_max)
        {
            continue;
        }

        // The window is full: move it into its ring slot and start a fresh working window.
        var slot = Datas.now_time_read % 6;
        var previousSlot = (Datas.now_time_read + 5) % 6;
        Datas.frequency[slot] = Datas.frequency[6];
        Datas.total_data[slot] = Datas.total_data[6];
        Datas.frequency[6] = new Dictionary<string, 微博舆论.words>();
        Datas.total_data[6] = new List<SinaJSON>();

        var current = Datas.frequency[slot];
        var previous = Datas.frequency[previousSlot];

        // Base weight: every word of a post receives a share of the post's engagement score,
        // scaled by the word's count relative to the most frequent word in that post.
        foreach (SinaJSON post in Datas.total_data[slot])
        {
            ICollection<WordInfo> postTokens = segment.DoSegment(post.Text);
            if (postTokens.Count == 0)
            {
                continue;
            }

            string dominant = postTokens.ElementAt(0).Word;
            foreach (WordInfo token in postTokens)
            {
                if (current.ContainsKey(token.Word) && current[dominant].now_nu < current[token.Word].now_nu)
                {
                    dominant = token.Word;
                }
            }

            var dominantCount = current[dominant].now_nu;
            var engagement = Datas.weight_word * post.Attitudes_count + (1 - Datas.weight_word) * post.Comments_count;
            foreach (WordInfo token in postTokens)
            {
                var stats = current[token.Word];
                double share = 0.5 + 0.5 * stats.now_nu / dominantCount;
                stats.weight_quan += share * engagement;
            }
        }

        // Burstiness (user weighting still to be added). Only computed when the preceding
        // slot holds data to compare against.
        if (previous.Count > 0)
        {
            // Words whose burstiness is greater than zero, in dictionary enumeration order.
            List<string> burstyWords = new List<string>();
            foreach (var pair in current)
            {
                double earlierWeight = 0;
                微博舆论.words earlier;
                if (previous.TryGetValue(pair.Key, out earlier))
                {
                    earlierWeight = earlier.weight_quan;
                }

                pair.Value.bursty = (pair.Value.weight_quan - earlierWeight) / 2;
                if (pair.Value.bursty > 0)
                {
                    burstyWords.Add(pair.Key);
                }
            }

            // Distance clustering: start with one cluster per bursty word.
            List<ju_word> clusters = new List<ju_word>();
            foreach (string burstyWord in burstyWords)
            {
                clusters.Add(new ju_word(burstyWord, current[burstyWord].position));
            }

            double lastMergeDistance = 0;
            List<int> bestSharedPositions = new List<int>();
            List<int> sharedPositions = new List<int>();

            // Repeatedly merge the closest pair until the last merge distance reaches 15.
            while (lastMergeDistance < 15)
            {
                double bestDistance = 10000;
                int bestLeft = 0;
                int bestRight = 0;

                for (int a = 0; a < clusters.Count; a++)
                {
                    ju_word left = clusters[a];
                    for (int b = a + 1; b < clusters.Count; b++)
                    {
                        ju_word right = clusters[b];

                        // Numerator: summed burstiness products over all cross-cluster word
                        // pairs, divided by the combined size of BOTH clusters (the original
                        // added the left cluster's size twice).
                        double burstProduct = 0;
                        foreach (string leftWord in left.word)
                        {
                            double leftBursty = current[leftWord].bursty;
                            foreach (string rightWord in right.word)
                            {
                                burstProduct += leftBursty * current[rightWord].bursty;
                            }
                        }

                        double numerator = burstProduct / (left.word.Count + right.word.Count);

                        // Denominator: fraction of the window's posts in which both clusters occur.
                        sharedPositions.Clear();
                        foreach (int postIndex in left.position)
                        {
                            if (right.position.Contains(postIndex))
                            {
                                sharedPositions.Add(postIndex);
                            }
                        }

                        double denominator = (double)sharedPositions.Count / (double)Datas.now_time_max;

                        // With no shared posts this is a division by zero in double arithmetic,
                        // which yields infinity and therefore never beats bestDistance.
                        double distance = numerator / denominator;
                        if (distance < bestDistance)
                        {
                            bestDistance = distance;
                            bestLeft = a;
                            bestRight = b;
                            bestSharedPositions = new List<int>(sharedPositions);
                        }
                    }
                }

                // bestRight is always greater than bestLeft when a pair was chosen, so 0/0
                // means no pair scored below the initial threshold.
                if (bestLeft == 0 && bestRight == 0)
                {
                    break;
                }

                // Merge the right cluster into the left one; the merged cluster keeps only
                // the positions the two had in common.
                ju_word target = clusters[bestLeft];
                ju_word absorbed = clusters[bestRight];
                foreach (string absorbedWord in absorbed.word)
                {
                    target.word.Add(absorbedWord);
                }

                target.position = new List<int>(bestSharedPositions);
                clusters.RemoveAt(bestRight);
                lastMergeDistance = bestDistance;
            }

            // Publish clusters containing more than one word, largest shared-position count first.
            Datas.ju_list[slot] = clusters
                .Where(cluster => cluster.word.Count > 1)
                .OrderByDescending(cluster => cluster.position.Count)
                .ToList();

            if (!Datas.ProgressBarCompelet)
            {
                Datas.ProgressBarCompelet = true;
            }
        }

        Datas.now_time_count = 0;
        Datas.now_time_read++;
    }
}
/// <summary>
/// Waits on <c>Datas.auto_read</c>, then walks <c>Datas.temp_data</c> from index
/// <c>Datas.now_readnum</c> to the end. Posts that match fewer than four of the
/// comma-separated terms in <c>Datas.ads_word</c> are cleaned, segmented and counted
/// into the working window (slot 6 of <c>Datas.frequency</c> / <c>Datas.total_data</c>).
/// Each time <c>Datas.now_time_max</c> posts have been collected the window is moved into
/// slot <c>now_time_read % 6</c>, per-word weights and burstiness are computed against the
/// preceding slot, bursty words are merged into clusters, and the multi-word clusters are
/// stored in <c>Datas.ju_list</c>.
/// </summary>
public void readdata()
{
    // High surrogates matched by the original numeric checks (55356 and 55357).
    const char EmojiHighSurrogateA = '\uD83C';
    const char EmojiHighSurrogateB = '\uD83D';

    Segment segment = new Segment();
    string[] filterTerms = Datas.ads_word.Split(',');
    HashSet<string> seenInPost = new HashSet<string>();

    Datas.auto_read.WaitOne();

    for (; Datas.now_readnum < Datas.temp_data.Count; Datas.now_readnum++)
    {
        // NOTE(review): the loop condition already guarantees now_readnum < Count, so this
        // wait can only trigger if the collection shrinks concurrently. Kept as-is; confirm
        // whether the intent was to block here until more data arrives.
        if (Datas.now_readnum == Datas.temp_data.Count)
        {
            Datas.auto_read.WaitOne();
        }

        SinaJSON item = Datas.temp_data.ElementAt(Datas.now_readnum);

        // Count how many filter terms occur in the post; four or more hits skips the post.
        int filterHits = 0;
        foreach (string term in filterTerms)
        {
            if (item.Text.IndexOf(term) != -1)
            {
                filterHits++;
            }
        }

        if (filterHits >= 4)
        {
            continue;
        }

        // Strip latin-letter runs (URLs and similar) before segmentation.
        string cleaned = Regex.Replace(item.Text, @"[a-zA-Z]+[:/]*[a-zA-Z\d\./]+", "");

        // Remove emoji surrogate pairs. The single-argument string.Remove(i) deletes
        // everything from i to the end of the string, which discarded all text after the
        // first emoji; only the pair itself is removed here.
        for (int k = cleaned.Length - 1; k > -1; k--)
        {
            if (cleaned[k] == EmojiHighSurrogateA || cleaned[k] == EmojiHighSurrogateB)
            {
                bool hasLowSurrogate = k + 1 < cleaned.Length && char.IsLowSurrogate(cleaned[k + 1]);
                cleaned = cleaned.Remove(k, hasLowSurrogate ? 2 : 1);
            }
        }

        item.Text = cleaned;

        // Count each distinct word once per post and record the post's index in the window.
        ICollection<WordInfo> tokens = segment.DoSegment(item.Text);
        seenInPost.Clear();
        foreach (WordInfo wordInfo in tokens)
        {
            if (!seenInPost.Add(wordInfo.Word))
            {
                continue;
            }

            微博舆论.words known;
            if (Datas.frequency[6].TryGetValue(wordInfo.Word, out known))
            {
                known.now_nu++;
                known.position.Add(Datas.now_time_count);
            }
            else
            {
                Datas.frequency[6].Add(wordInfo.Word, new words(wordInfo.Word, Datas.now_time_count));
            }
        }

        Datas.total_data[6].Add(item);
        Datas.total_num++;
        Datas.now_time_count++;

        if (Datas.now_time_count < Datas.now_time_max)
        {
            continue;
        }

        // The window is full: move it into its ring slot and start a fresh working window.
        var slot = Datas.now_time_read % 6;
        var previousSlot = (Datas.now_time_read + 5) % 6;
        Datas.frequency[slot] = Datas.frequency[6];
        Datas.total_data[slot] = Datas.total_data[6];
        Datas.frequency[6] = new Dictionary<string, 微博舆论.words>();
        Datas.total_data[6] = new List<SinaJSON>();

        var current = Datas.frequency[slot];
        var previous = Datas.frequency[previousSlot];

        // Base weight: every word of a post receives a share of the post's engagement score,
        // scaled by the word's count relative to the most frequent word in that post.
        foreach (SinaJSON post in Datas.total_data[slot])
        {
            ICollection<WordInfo> postTokens = segment.DoSegment(post.Text);
            if (postTokens.Count == 0)
            {
                continue;
            }

            string dominant = postTokens.ElementAt(0).Word;
            foreach (WordInfo token in postTokens)
            {
                if (current.ContainsKey(token.Word) && current[dominant].now_nu < current[token.Word].now_nu)
                {
                    dominant = token.Word;
                }
            }

            var dominantCount = current[dominant].now_nu;
            var engagement = Datas.weight_word * post.Attitudes_count + (1 - Datas.weight_word) * post.Comments_count;
            foreach (WordInfo token in postTokens)
            {
                var stats = current[token.Word];
                double share = 0.5 + 0.5 * stats.now_nu / dominantCount;
                stats.weight_quan += share * engagement;
            }
        }

        // Burstiness (user weighting still to be added). Only computed when the preceding
        // slot holds data to compare against.
        if (previous.Count > 0)
        {
            // Words whose burstiness is greater than zero, in dictionary enumeration order.
            List<string> burstyWords = new List<string>();
            foreach (var pair in current)
            {
                double earlierWeight = 0;
                微博舆论.words earlier;
                if (previous.TryGetValue(pair.Key, out earlier))
                {
                    earlierWeight = earlier.weight_quan;
                }

                pair.Value.bursty = (pair.Value.weight_quan - earlierWeight) / 2;
                if (pair.Value.bursty > 0)
                {
                    burstyWords.Add(pair.Key);
                }
            }

            // Distance clustering: start with one cluster per bursty word.
            List<ju_word> clusters = new List<ju_word>();
            foreach (string burstyWord in burstyWords)
            {
                clusters.Add(new ju_word(burstyWord, current[burstyWord].position));
            }

            double lastMergeDistance = 0;
            List<int> bestSharedPositions = new List<int>();
            List<int> sharedPositions = new List<int>();

            // Repeatedly merge the closest pair until the last merge distance reaches 15.
            while (lastMergeDistance < 15)
            {
                double bestDistance = 10000;
                int bestLeft = 0;
                int bestRight = 0;

                for (int a = 0; a < clusters.Count; a++)
                {
                    ju_word left = clusters[a];
                    for (int b = a + 1; b < clusters.Count; b++)
                    {
                        ju_word right = clusters[b];

                        // Numerator: summed burstiness products over all cross-cluster word
                        // pairs, divided by the combined size of BOTH clusters (the original
                        // added the left cluster's size twice).
                        double burstProduct = 0;
                        foreach (string leftWord in left.word)
                        {
                            double leftBursty = current[leftWord].bursty;
                            foreach (string rightWord in right.word)
                            {
                                burstProduct += leftBursty * current[rightWord].bursty;
                            }
                        }

                        double numerator = burstProduct / (left.word.Count + right.word.Count);

                        // Denominator: fraction of the window's posts in which both clusters occur.
                        sharedPositions.Clear();
                        foreach (int postIndex in left.position)
                        {
                            if (right.position.Contains(postIndex))
                            {
                                sharedPositions.Add(postIndex);
                            }
                        }

                        double denominator = (double)sharedPositions.Count / (double)Datas.now_time_max;

                        // With no shared posts this is a division by zero in double arithmetic,
                        // which yields infinity and therefore never beats bestDistance.
                        double distance = numerator / denominator;
                        if (distance < bestDistance)
                        {
                            bestDistance = distance;
                            bestLeft = a;
                            bestRight = b;
                            bestSharedPositions = new List<int>(sharedPositions);
                        }
                    }
                }

                // bestRight is always greater than bestLeft when a pair was chosen, so 0/0
                // means no pair scored below the initial threshold.
                if (bestLeft == 0 && bestRight == 0)
                {
                    break;
                }

                // Merge the right cluster into the left one; the merged cluster keeps only
                // the positions the two had in common.
                ju_word target = clusters[bestLeft];
                ju_word absorbed = clusters[bestRight];
                foreach (string absorbedWord in absorbed.word)
                {
                    target.word.Add(absorbedWord);
                }

                target.position = new List<int>(bestSharedPositions);
                clusters.RemoveAt(bestRight);
                lastMergeDistance = bestDistance;
            }

            // Publish clusters containing more than one word, largest shared-position count first.
            Datas.ju_list[slot] = clusters
                .Where(cluster => cluster.word.Count > 1)
                .OrderByDescending(cluster => cluster.position.Count)
                .ToList();

            if (!Datas.ProgressBarCompelet)
            {
                Datas.ProgressBarCompelet = true;
            }
        }

        Datas.now_time_count = 0;
        Datas.now_time_read++;
    }
}