Esempio n. 1
0
        /// <summary>
        /// Drains the unread posts in <c>Datas.temp_data</c>, folds their words into the live
        /// window statistics (slot 6 of <c>Datas.frequency</c> / <c>Datas.total_data</c>) and,
        /// each time <c>Datas.now_time_max</c> posts have been accepted, closes the window:
        /// computes per-word base weights and burstiness, clusters the bursty words and
        /// publishes the clusters to <c>Datas.ju_list</c>.
        /// </summary>
        /// <remarks>
        /// Blocks on <c>Datas.auto_read</c> before starting. Posts containing four or more of the
        /// comma-separated <c>Datas.ads_word</c> keywords are skipped. Closed windows are stored in
        /// ring slots <c>Datas.now_time_read % 6</c>; burstiness compares a window against the slot
        /// written immediately before it. A missing dictionary entry for a re-segmented word
        /// surfaces as <see cref="KeyNotFoundException"/>, as before.
        /// </remarks>
        public void readdata()
        {
            Segment segment = new Segment();
            string[] adKeywords = Datas.ads_word.Split(',');
            // Words already counted for the post being processed (each word counts once per post).
            HashSet<string> seenInPost = new HashSet<string>();

            Datas.auto_read.WaitOne();
            for (; Datas.now_readnum < Datas.temp_data.Count; Datas.now_readnum++)
            {
                // NOTE(review): the loop condition guarantees now_readnum < Count here, so this wait
                // only triggers if temp_data shrinks concurrently. It looks like the intent was to
                // wait for more data once caught up -- confirm with the producer side before changing.
                if (Datas.now_readnum == Datas.temp_data.Count)
                {
                    Datas.auto_read.WaitOne();
                }

                SinaJSON item = Datas.temp_data.ElementAt(Datas.now_readnum);

                // Advertisement filter: count how many ad keywords occur in the post.
                int adHits = 0;
                foreach (string keyword in adKeywords)
                {
                    if (item.Text.IndexOf(keyword) != -1)
                    {
                        adHits++;
                    }
                }
                if (adHits >= 4)
                {
                    continue;
                }

                // Strip URL-like / latin tokens.
                string text = Regex.Replace(item.Text, @"[a-zA-Z]+[:/]*[a-zA-Z\d\./]+", "");

                // Strip characters introduced by the high surrogates 0xD83C (55356) / 0xD83D (55357),
                // i.e. the lead units of most emoji. Only the surrogate pair itself is removed;
                // the previous Remove(i) overload cut off everything from the emoji to the end.
                for (int k = text.Length - 1; k > -1; k--)
                {
                    if (text[k] == 55356 || text[k] == 55357)
                    {
                        bool hasTrail = k + 1 < text.Length && char.IsLowSurrogate(text[k + 1]);
                        text = text.Remove(k, hasTrail ? 2 : 1);
                    }
                }
                item.Text = text;

                // Fold the post's distinct words into the live window (slot 6).
                ICollection<WordInfo> postWords = segment.DoSegment(item.Text);
                var liveFreq = Datas.frequency[6];
                seenInPost.Clear();
                foreach (WordInfo wordInfo in postWords)
                {
                    if (!seenInPost.Add(wordInfo.Word))
                    {
                        continue;
                    }

                    微博舆论.words existing;
                    if (liveFreq.TryGetValue(wordInfo.Word, out existing))
                    {
                        existing.now_nu++;
                        existing.position.Add(Datas.now_time_count);
                    }
                    else
                    {
                        liveFreq.Add(wordInfo.Word, new 微博舆论.words(wordInfo.Word, Datas.now_time_count));
                    }
                }

                Datas.total_data[6].Add(item);
                Datas.total_num++;
                Datas.now_time_count++;
                if (Datas.now_time_count < Datas.now_time_max)
                {
                    continue;
                }

                // ---- The window is full: move it into its ring slot and start a fresh live window.
                var slot = Datas.now_time_read % 6;
                Datas.frequency[slot] = Datas.frequency[6];
                Datas.total_data[slot] = Datas.total_data[6];
                Datas.frequency[6] = new Dictionary<string, 微博舆论.words>();
                Datas.total_data[6] = new List<SinaJSON>();

                var windowFreq = Datas.frequency[slot];
                var windowPosts = Datas.total_data[slot];

                // ---- Base weights: every word occurrence in a post earns
                // (0.5 + 0.5 * freq / maxFreqInPost) * engagement of that post.
                foreach (SinaJSON post in windowPosts)
                {
                    ICollection<WordInfo> postTokens = segment.DoSegment(post.Text);
                    if (postTokens.Count == 0)
                    {
                        continue;
                    }

                    // Most frequent word of this post (first one wins ties).
                    微博舆论.words maxEntry = windowFreq[postTokens.ElementAt(0).Word];
                    foreach (WordInfo probe in postTokens)
                    {
                        微博舆论.words candidate;
                        if (windowFreq.TryGetValue(probe.Word, out candidate) && maxEntry.now_nu < candidate.now_nu)
                        {
                            maxEntry = candidate;
                        }
                    }

                    double engagement = Datas.weight_word * post.Attitudes_count + (1 - Datas.weight_word) * post.Comments_count;
                    foreach (WordInfo weighted in postTokens)
                    {
                        微博舆论.words weightedEntry = windowFreq[weighted.Word];
                        double relativeFrequency = 0.5 + 0.5 * weightedEntry.now_nu / maxEntry.now_nu;
                        weightedEntry.weight_quan += relativeFrequency * engagement;
                    }
                }

                // ---- Burstiness (user weight still to be added) and clustering; only possible
                // when the previously written slot holds data to compare against.
                var previousFreq = Datas.frequency[(Datas.now_time_read + 5) % 6];
                if (previousFreq.Count > 0)
                {
                    // Words whose burstiness is greater than 0.
                    List<string> burstyWords = new List<string>();
                    foreach (var pair in windowFreq)
                    {
                        double previousWeight = 0;
                        微博舆论.words previousEntry;
                        if (previousFreq.TryGetValue(pair.Key, out previousEntry))
                        {
                            previousWeight = previousEntry.weight_quan;
                        }

                        pair.Value.bursty = (pair.Value.weight_quan - previousWeight) / 2;
                        if (pair.Value.bursty > 0)
                        {
                            burstyWords.Add(pair.Key);
                        }
                    }

                    // Distance clustering: start with one cluster per bursty word.
                    List<ju_word> clusters = new List<ju_word>();
                    foreach (string burstyWord in burstyWords)
                    {
                        clusters.Add(new ju_word(burstyWord, windowFreq[burstyWord].position));
                    }

                    const double MergeThreshold = 15;        // stop once the last merge distance reaches this
                    const double InitialMinDistance = 10000; // pairs at or above this distance are never merged
                    double lastMergeDistance = 0;
                    List<int> bestShared = new List<int>();
                    while (lastMergeDistance < MergeThreshold)
                    {
                        double minDistance = InitialMinDistance;
                        int bestLeft = 0;
                        int bestRight = 0;

                        for (int a = 0; a < clusters.Count; a++)
                        {
                            ju_word left = clusters[a];
                            for (int b = a + 1; b < clusters.Count; b++)
                            {
                                ju_word right = clusters[b];

                                // Numerator: sum of pairwise burstiness products, averaged over the
                                // combined size of BOTH clusters (previously the left size was
                                // counted twice and the right size ignored).
                                double numerator = 0;
                                foreach (var leftWord in left.word)
                                {
                                    double leftBursty = windowFreq[leftWord].bursty;
                                    foreach (var rightWord in right.word)
                                    {
                                        numerator += leftBursty * windowFreq[rightWord].bursty;
                                    }
                                }
                                numerator = numerator / (left.word.Count + right.word.Count);

                                // Denominator: share of the window's posts in which both clusters occur.
                                List<int> shared = new List<int>();
                                foreach (var pos in left.position)
                                {
                                    if (right.position.Contains(pos))
                                    {
                                        shared.Add(pos);
                                    }
                                }
                                double denominator = (double)shared.Count / (double)Datas.now_time_max;

                                // No shared posts gives +Infinity (or NaN), which never beats minDistance.
                                double distance = numerator / denominator;
                                if (distance < minDistance)
                                {
                                    minDistance = distance;
                                    bestLeft = a;
                                    bestRight = b;
                                    bestShared = shared;
                                }
                            }
                        }

                        // bestRight is always > bestLeft for a real pair, so (0, 0) means "nothing to merge".
                        if (bestLeft == 0 && bestRight == 0)
                        {
                            break;
                        }

                        // Merge the closest pair: the right cluster is absorbed into the left one.
                        ju_word target = clusters[bestLeft];
                        ju_word absorbed = clusters[bestRight];
                        foreach (var movedWord in absorbed.word)
                        {
                            target.word.Add(movedWord);
                        }
                        target.position = new List<int>(bestShared);
                        clusters.RemoveAt(bestRight);
                        lastMergeDistance = minDistance;
                    }

                    // Publish multi-word clusters, those covering the most posts first.
                    Datas.ju_list[slot] = clusters
                        .Where(c => c.word.Count > 1)
                        .OrderByDescending(c => c.position.Count)
                        .ToList();

                    //this.Invoke(new updateListboxFreq(doUpdateListboxFreq), new object[] { Datas.ju_list });
                    if (!Datas.ProgressBarCompelet)
                    {
                        Datas.ProgressBarCompelet = true;
                    }
                }

                Datas.now_time_count = 0;
                Datas.now_time_read++;
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Drains the unread posts in <c>Datas.temp_data</c>, folds their words into the live
        /// window statistics (slot 6 of <c>Datas.frequency</c> / <c>Datas.total_data</c>) and,
        /// each time <c>Datas.now_time_max</c> posts have been accepted, closes the window:
        /// computes per-word base weights and burstiness, clusters the bursty words and
        /// publishes the clusters to <c>Datas.ju_list</c>.
        /// </summary>
        /// <remarks>
        /// Blocks on <c>Datas.auto_read</c> before starting. Posts containing four or more of the
        /// comma-separated <c>Datas.ads_word</c> keywords are skipped. Closed windows are stored in
        /// ring slots <c>Datas.now_time_read % 6</c>; burstiness compares a window against the slot
        /// written immediately before it. A missing dictionary entry for a re-segmented word
        /// surfaces as <see cref="KeyNotFoundException"/>, as before.
        /// </remarks>
        public void readdata()
        {
            Segment segment = new Segment();
            string[] adKeywords = Datas.ads_word.Split(',');
            // Words already counted for the post being processed (each word counts once per post).
            HashSet<string> seenInPost = new HashSet<string>();

            Datas.auto_read.WaitOne();
            for (; Datas.now_readnum < Datas.temp_data.Count; Datas.now_readnum++)
            {
                // NOTE(review): the loop condition guarantees now_readnum < Count here, so this wait
                // only triggers if temp_data shrinks concurrently. It looks like the intent was to
                // wait for more data once caught up -- confirm with the producer side before changing.
                if (Datas.now_readnum == Datas.temp_data.Count)
                {
                    Datas.auto_read.WaitOne();
                }

                SinaJSON item = Datas.temp_data.ElementAt(Datas.now_readnum);

                // Advertisement filter: count how many ad keywords occur in the post.
                int adHits = 0;
                foreach (string keyword in adKeywords)
                {
                    if (item.Text.IndexOf(keyword) != -1)
                    {
                        adHits++;
                    }
                }
                if (adHits >= 4)
                {
                    continue;
                }

                // Strip URL-like / latin tokens.
                string text = Regex.Replace(item.Text, @"[a-zA-Z]+[:/]*[a-zA-Z\d\./]+", "");

                // Strip characters introduced by the high surrogates 0xD83C (55356) / 0xD83D (55357),
                // i.e. the lead units of most emoji. Only the surrogate pair itself is removed;
                // the previous Remove(i) overload cut off everything from the emoji to the end.
                for (int k = text.Length - 1; k > -1; k--)
                {
                    if (text[k] == 55356 || text[k] == 55357)
                    {
                        bool hasTrail = k + 1 < text.Length && char.IsLowSurrogate(text[k + 1]);
                        text = text.Remove(k, hasTrail ? 2 : 1);
                    }
                }
                item.Text = text;

                // Fold the post's distinct words into the live window (slot 6).
                ICollection<WordInfo> postWords = segment.DoSegment(item.Text);
                var liveFreq = Datas.frequency[6];
                seenInPost.Clear();
                foreach (WordInfo wordInfo in postWords)
                {
                    if (!seenInPost.Add(wordInfo.Word))
                    {
                        continue;
                    }

                    微博舆论.words existing;
                    if (liveFreq.TryGetValue(wordInfo.Word, out existing))
                    {
                        existing.now_nu++;
                        existing.position.Add(Datas.now_time_count);
                    }
                    else
                    {
                        liveFreq.Add(wordInfo.Word, new 微博舆论.words(wordInfo.Word, Datas.now_time_count));
                    }
                }

                Datas.total_data[6].Add(item);
                Datas.total_num++;
                Datas.now_time_count++;
                if (Datas.now_time_count < Datas.now_time_max)
                {
                    continue;
                }

                // ---- The window is full: move it into its ring slot and start a fresh live window.
                var slot = Datas.now_time_read % 6;
                Datas.frequency[slot] = Datas.frequency[6];
                Datas.total_data[slot] = Datas.total_data[6];
                Datas.frequency[6] = new Dictionary<string, 微博舆论.words>();
                Datas.total_data[6] = new List<SinaJSON>();

                var windowFreq = Datas.frequency[slot];
                var windowPosts = Datas.total_data[slot];

                // ---- Base weights: every word occurrence in a post earns
                // (0.5 + 0.5 * freq / maxFreqInPost) * engagement of that post.
                foreach (SinaJSON post in windowPosts)
                {
                    ICollection<WordInfo> postTokens = segment.DoSegment(post.Text);
                    if (postTokens.Count == 0)
                    {
                        continue;
                    }

                    // Most frequent word of this post (first one wins ties).
                    微博舆论.words maxEntry = windowFreq[postTokens.ElementAt(0).Word];
                    foreach (WordInfo probe in postTokens)
                    {
                        微博舆论.words candidate;
                        if (windowFreq.TryGetValue(probe.Word, out candidate) && maxEntry.now_nu < candidate.now_nu)
                        {
                            maxEntry = candidate;
                        }
                    }

                    double engagement = Datas.weight_word * post.Attitudes_count + (1 - Datas.weight_word) * post.Comments_count;
                    foreach (WordInfo weighted in postTokens)
                    {
                        微博舆论.words weightedEntry = windowFreq[weighted.Word];
                        double relativeFrequency = 0.5 + 0.5 * weightedEntry.now_nu / maxEntry.now_nu;
                        weightedEntry.weight_quan += relativeFrequency * engagement;
                    }
                }

                // ---- Burstiness (user weight still to be added) and clustering; only possible
                // when the previously written slot holds data to compare against.
                var previousFreq = Datas.frequency[(Datas.now_time_read + 5) % 6];
                if (previousFreq.Count > 0)
                {
                    // Words whose burstiness is greater than 0.
                    List<string> burstyWords = new List<string>();
                    foreach (var pair in windowFreq)
                    {
                        double previousWeight = 0;
                        微博舆论.words previousEntry;
                        if (previousFreq.TryGetValue(pair.Key, out previousEntry))
                        {
                            previousWeight = previousEntry.weight_quan;
                        }

                        pair.Value.bursty = (pair.Value.weight_quan - previousWeight) / 2;
                        if (pair.Value.bursty > 0)
                        {
                            burstyWords.Add(pair.Key);
                        }
                    }

                    // Distance clustering: start with one cluster per bursty word.
                    List<ju_word> clusters = new List<ju_word>();
                    foreach (string burstyWord in burstyWords)
                    {
                        clusters.Add(new ju_word(burstyWord, windowFreq[burstyWord].position));
                    }

                    const double MergeThreshold = 15;        // stop once the last merge distance reaches this
                    const double InitialMinDistance = 10000; // pairs at or above this distance are never merged
                    double lastMergeDistance = 0;
                    List<int> bestShared = new List<int>();
                    while (lastMergeDistance < MergeThreshold)
                    {
                        double minDistance = InitialMinDistance;
                        int bestLeft = 0;
                        int bestRight = 0;

                        for (int a = 0; a < clusters.Count; a++)
                        {
                            ju_word left = clusters[a];
                            for (int b = a + 1; b < clusters.Count; b++)
                            {
                                ju_word right = clusters[b];

                                // Numerator: sum of pairwise burstiness products, averaged over the
                                // combined size of BOTH clusters (previously the left size was
                                // counted twice and the right size ignored).
                                double numerator = 0;
                                foreach (var leftWord in left.word)
                                {
                                    double leftBursty = windowFreq[leftWord].bursty;
                                    foreach (var rightWord in right.word)
                                    {
                                        numerator += leftBursty * windowFreq[rightWord].bursty;
                                    }
                                }
                                numerator = numerator / (left.word.Count + right.word.Count);

                                // Denominator: share of the window's posts in which both clusters occur.
                                List<int> shared = new List<int>();
                                foreach (var pos in left.position)
                                {
                                    if (right.position.Contains(pos))
                                    {
                                        shared.Add(pos);
                                    }
                                }
                                double denominator = (double)shared.Count / (double)Datas.now_time_max;

                                // No shared posts gives +Infinity (or NaN), which never beats minDistance.
                                double distance = numerator / denominator;
                                if (distance < minDistance)
                                {
                                    minDistance = distance;
                                    bestLeft = a;
                                    bestRight = b;
                                    bestShared = shared;
                                }
                            }
                        }

                        // bestRight is always > bestLeft for a real pair, so (0, 0) means "nothing to merge".
                        if (bestLeft == 0 && bestRight == 0)
                        {
                            break;
                        }

                        // Merge the closest pair: the right cluster is absorbed into the left one.
                        ju_word target = clusters[bestLeft];
                        ju_word absorbed = clusters[bestRight];
                        foreach (var movedWord in absorbed.word)
                        {
                            target.word.Add(movedWord);
                        }
                        target.position = new List<int>(bestShared);
                        clusters.RemoveAt(bestRight);
                        lastMergeDistance = minDistance;
                    }

                    // Publish multi-word clusters, those covering the most posts first.
                    Datas.ju_list[slot] = clusters
                        .Where(c => c.word.Count > 1)
                        .OrderByDescending(c => c.position.Count)
                        .ToList();

                    //this.Invoke(new updateListboxFreq(doUpdateListboxFreq), new object[] { Datas.ju_list });
                    if (!Datas.ProgressBarCompelet)
                    {
                        Datas.ProgressBarCompelet = true;
                    }
                }

                Datas.now_time_count = 0;
                Datas.now_time_read++;
            }
        }