Beispiel #1
0
        /// <summary>
        /// Builds a binary cluster tree over the words of <paramref name="corpus"/> and stores its
        /// root in <c>mClusterRoot</c>.
        /// </summary>
        /// <remarks>
        /// Steps performed:
        /// 1. Count unigrams and adjacent-word bigrams of <paramref name="word_sequence"/>; a word that
        ///    is not in the corpus is skipped and breaks the adjacency chain.
        /// 2. Seed the working set with the <c>min(corpus.Count, mM)</c> most frequent words, one
        ///    leaf node each.
        /// 3. For every remaining word (most frequent first) add a leaf, score every candidate pair
        ///    merge in parallel with <c>_CalcClusterQuality</c>, and merge the best-scoring pair.
        /// 4. Keep merging the best-scoring pair until a single root is left.
        /// The method writes one progress line per merge to the console.
        /// NOTE(review): an empty corpus still ends in <c>clusters.First()</c> throwing
        /// <see cref="InvalidOperationException"/>, as before - confirm whether callers rely on that.
        /// </remarks>
        /// <param name="corpus">Vocabulary to cluster; queried via Contains, Count and enumeration.</param>
        /// <param name="word_sequence">Token stream the unigram/bigram statistics are collected from.</param>
        public void Cluster(Corpus corpus, IEnumerable <string> word_sequence)
        {
            Dictionary <string, int> word_count_mapping = new Dictionary <string, int>();
            Dictionary <string, Dictionary <string, int> > word_sequence_count_mapping = new Dictionary <string, Dictionary <string, int> >();

            string prev_word = null;

            foreach (string word in word_sequence)
            {
                if (!corpus.Contains(word))
                {
                    // An out-of-corpus word breaks the chain: the next word has no predecessor.
                    prev_word = null;
                    continue;
                }

                _IncrementCount(word_count_mapping, word);

                if (prev_word != null)
                {
                    Dictionary <string, int> followers;
                    if (!word_sequence_count_mapping.TryGetValue(prev_word, out followers))
                    {
                        followers = new Dictionary <string, int>();
                        word_sequence_count_mapping[prev_word] = followers;
                    }
                    _IncrementCount(followers, word);
                }

                // Always advance the predecessor. The previous code only assigned it when it was
                // null, so every bigram in a run was counted against the run's first word.
                prev_word = word;
            }

            // Flatten the dictionaries into index-aligned arrays for the quality function.
            string[] words      = word_count_mapping.Keys.ToArray();
            int      word_count = words.Length;

            int[]   word_counts          = new int[word_count];
            int[][] word_sequence_counts = new int[word_count][];

            for (int i = 0; i < word_count; ++i)
            {
                string first_word = words[i];
                word_counts[i] = word_count_mapping[first_word];

                // New int arrays are zero-initialised, so only observed bigrams need writing.
                int[] row = new int[word_count];
                word_sequence_counts[i] = row;

                Dictionary <string, int> observed;
                if (!word_sequence_count_mapping.TryGetValue(first_word, out observed))
                {
                    continue;
                }

                for (int j = 0; j < word_count; ++j)
                {
                    int bigram_count;
                    if (observed.TryGetValue(words[j], out bigram_count))
                    {
                        row[j] = bigram_count;
                    }
                }
            }

            BinaryHeapMaxPQ <string> pq = new BinaryHeapMaxPQ <string>();

            pq.CompareKeys += (k1, k2) =>
            {
                // Corpus words that never occurred in word_sequence have no entry; treat them as
                // frequency 0 instead of throwing KeyNotFoundException (TryGetValue leaves 0).
                int frequency1;
                int frequency2;
                word_count_mapping.TryGetValue(k1, out frequency1);
                word_count_mapping.TryGetValue(k2, out frequency2);
                return(frequency1.CompareTo(frequency2));
            };
            foreach (string word in corpus)
            {
                pq.Insert(word);
            }

            int N = corpus.Count;
            int m = System.Math.Min(N, mM);
            HashSet <BCNode> clusters = new HashSet <BCNode>();

            for (int k = 0; k < m; ++k)
            {
                clusters.Add(new BCNode(k, pq.DeleteMax()));
            }

            // While words are still being added the working set holds m + 1 nodes before each merge.
            int ccount = clusters.Count + 1;

            // Leaves use ids 0..N-1; internal (merged) nodes are numbered from N upwards.
            int parentClusterClassId = N;

            for (int k = m; k < N; ++k)
            {
                clusters.Add(new BCNode(k, pq.DeleteMax()));

                BCNode[] nodes = clusters.ToArray();

                Debug.Assert(nodes.Length == ccount);

                BCNode selectedNode1;
                BCNode selectedNode2;
                double maxCQuality;
                if (!_FindBestMerge(nodes, words, word_counts, word_sequence_counts, out selectedNode1, out selectedNode2, out maxCQuality))
                {
                    break;
                }

                Console.WriteLine("k : {0} Node1: {1} Node2: {2} Quality: {3}", k, selectedNode1.ID, selectedNode2.ID, maxCQuality);

                _MergeClusters(clusters, parentClusterClassId++, selectedNode1, selectedNode2);
            }

            // Collapse the remaining clusters pairwise until one root is left.
            for (int k = 0; k < m - 1; ++k)
            {
                BCNode[] nodes = clusters.ToArray();

                BCNode selectedNode1;
                BCNode selectedNode2;
                double maxCQuality;
                if (!_FindBestMerge(nodes, words, word_counts, word_sequence_counts, out selectedNode1, out selectedNode2, out maxCQuality))
                {
                    break;
                }

                Console.WriteLine("K : {0} Node1: {1} Node2: {2} Quality: {3}", k, selectedNode1.ID, selectedNode2.ID, maxCQuality);

                _MergeClusters(clusters, parentClusterClassId++, selectedNode1, selectedNode2);
            }

            Debug.Assert(clusters.Count == 1);

            mClusterRoot = clusters.First();
        }

        /// <summary>Adds one to the counter stored under <paramref name="key"/>, starting from zero if absent.</summary>
        private static void _IncrementCount(Dictionary <string, int> counts, string key)
        {
            int current;
            counts.TryGetValue(key, out current); // leaves 0 when the key is missing
            counts[key] = current + 1;
        }

        /// <summary>
        /// Scores, in parallel, every pair (i, j) with i &lt; j of <paramref name="nodes"/> as a candidate
        /// merge and reports the pair with the strictly highest quality (first one wins on ties,
        /// in (i, j) enumeration order).
        /// </summary>
        /// <returns>
        /// <c>true</c> when a pair was selected; <c>false</c> when there were no pairs or no score
        /// exceeded <see cref="double.MinValue"/>.
        /// </returns>
        private bool _FindBestMerge(BCNode[] nodes, string[] words, int[] word_counts, int[][] word_sequence_counts,
                                    out BCNode bestNode1, out BCNode bestNode2, out double bestQuality)
        {
            int pair_count = (nodes.Length * (nodes.Length - 1)) / 2;

            TaskFactory <ClusterTaskResult> tFactory = new TaskFactory <ClusterTaskResult>();
            Task <ClusterTaskResult>[]      tasks    = new Task <ClusterTaskResult> [pair_count];

            // The pair indices travel through the task state object rather than a closure.
            Func <object, ClusterTaskResult> evaluate = (object obj) =>
            {
                int[] args = (int[])obj;
                int   ii   = args[0];
                int   jj   = args[1];

                BCNode node1 = nodes[ii];
                IEnumerable <string> keys1 = node1.ToST().Keys;

                BCNode node2 = nodes[jj];

                // Word -> cluster id assignment with node1 left out ...
                Dictionary <string, int> tempC = new Dictionary <string, int>();
                for (int l = 0; l < nodes.Length; ++l)
                {
                    if (l != ii)
                    {
                        BCNode.ToST(nodes[l], tempC, nodes[l].ID);
                    }
                }

                // ... and node1's words relabelled as node2, i.e. the hypothetical merge.
                foreach (string key1 in keys1)
                {
                    tempC[key1] = node2.ID;
                }

                ClusterTaskResult result = new ClusterTaskResult();
                result.cQuality = _CalcClusterQuality(tempC, words, word_counts, word_sequence_counts);
                result.Node1    = node1;
                result.Node2    = node2;

                return(result);
            };

            int pair_index = 0;
            for (int i = 0; i < nodes.Length - 1; ++i)
            {
                for (int j = i + 1; j < nodes.Length; ++j)
                {
                    tasks[pair_index++] = tFactory.StartNew(evaluate, new int[] { i, j });
                }
            }
            Debug.Assert(pair_index == pair_count);
            Task.WaitAll(tasks);

            bestQuality = double.MinValue;
            bestNode1   = null;
            bestNode2   = null;

            for (int t = 0; t < tasks.Length; ++t)
            {
                ClusterTaskResult candidate = tasks[t].Result;

                if (candidate.cQuality > bestQuality)
                {
                    bestQuality = candidate.cQuality;
                    bestNode1   = candidate.Node1;
                    bestNode2   = candidate.Node2;
                }
            }

            return(!(bestNode1 == null || bestNode2 == null));
        }

        /// <summary>
        /// Replaces <paramref name="left"/> and <paramref name="right"/> in <paramref name="clusters"/>
        /// with a new parent node carrying id <paramref name="parentId"/>.
        /// </summary>
        private static void _MergeClusters(HashSet <BCNode> clusters, int parentId, BCNode left, BCNode right)
        {
            BCNode parentNode = new BCNode(parentId, null);
            parentNode.Left  = left;
            parentNode.Right = right;
            clusters.Remove(left);
            clusters.Remove(right);
            clusters.Add(parentNode);
        }