/// <summary>
/// Builds a binary merge tree over the words of <paramref name="corpus"/> and stores
/// its root in <c>mClusterRoot</c>.
/// </summary>
/// <remarks>
/// Steps:
/// 1. Count unigrams and adjacent-word bigrams of <paramref name="word_sequence"/>,
///    restricted to words contained in the corpus.
/// 2. Pull words from a max priority queue ordered by unigram count. The first
///    min(corpus.Count, mM) words each start their own cluster.
/// 3. For every remaining word: add it as a new cluster, then merge the pair of
///    clusters whose merge yields the highest value of <c>_CalcClusterQuality</c>.
/// 4. Keep merging the best pair until a single cluster remains.
/// Candidate merges of one step are scored in parallel tasks.
/// For an empty corpus no tree can be built and <c>mClusterRoot</c> is set to the
/// default value (null for a reference type).
/// </remarks>
/// <param name="corpus">Vocabulary to cluster; words outside it are ignored and break bigram adjacency.</param>
/// <param name="word_sequence">Token stream the unigram/bigram statistics are taken from.</param>
/// <exception cref="ArgumentNullException">If either argument is null.</exception>
public void Cluster(Corpus corpus, IEnumerable<string> word_sequence)
{
    if (corpus == null)
    {
        throw new ArgumentNullException("corpus");
    }
    if (word_sequence == null)
    {
        throw new ArgumentNullException("word_sequence");
    }

    Dictionary<string, int> word_count_mapping = new Dictionary<string, int>();
    Dictionary<string, Dictionary<string, int>> word_sequence_count_mapping =
        new Dictionary<string, Dictionary<string, int>>();
    _CountWordsAndBigrams(corpus, word_sequence, word_count_mapping, word_sequence_count_mapping);

    // Dense, index-aligned views of the statistics: words[i] <-> word_counts[i] <-> word_sequence_counts[i][*].
    string[] words = word_count_mapping.Keys.ToArray();
    int[] word_counts = new int[words.Length];
    for (int i = 0; i < words.Length; ++i)
    {
        word_counts[i] = word_count_mapping[words[i]];
    }
    int[][] word_sequence_counts = _BuildBigramCountMatrix(words, word_sequence_count_mapping);

    // Order corpus words by observed frequency. A corpus word that never occurred in
    // word_sequence has no entry in word_count_mapping; it is ranked with frequency 0
    // instead of making the comparator throw KeyNotFoundException.
    BinaryHeapMaxPQ<string> pq = new BinaryHeapMaxPQ<string>();
    pq.CompareKeys += (k1, k2) =>
    {
        return _FrequencyOf(word_count_mapping, k1).CompareTo(_FrequencyOf(word_count_mapping, k2));
    };
    foreach (string word in corpus)
    {
        pq.Insert(word);
    }

    int N = corpus.Count;
    int m = System.Math.Min(N, mM);

    // Seed: the m most frequent words each form a singleton cluster with IDs 0..m-1.
    HashSet<BCNode> clusters = new HashSet<BCNode>();
    for (int k = 0; k < m; ++k)
    {
        clusters.Add(new BCNode(k, pq.DeleteMax()));
    }

    // During phase 1 there are always (seed count + 1) clusters right after a word is added.
    int ccount = clusters.Count + 1;
    // Internal (merged) nodes get IDs starting after the N leaf IDs.
    int parentClusterClassId = N;

    // Phase 1: add the next most frequent word, then merge the best pair so the
    // number of clusters drops back to the seed count.
    for (int k = m; k < N; ++k)
    {
        clusters.Add(new BCNode(k, pq.DeleteMax()));
        Debug.Assert(clusters.Count == ccount);

        if (!_MergeBestPair(clusters, words, word_counts, word_sequence_counts,
                            parentClusterClassId, "k : {0} Node1: {1} Node2: {2} Quality: {3}", k))
        {
            break;
        }
        parentClusterClassId++;
    }

    // Phase 2: merge the remaining clusters pairwise until one root is left.
    for (int k = 0; k < m - 1; ++k)
    {
        if (!_MergeBestPair(clusters, words, word_counts, word_sequence_counts,
                            parentClusterClassId, "K : {0} Node1: {1} Node2: {2} Quality: {3}", k))
        {
            break;
        }
        parentClusterClassId++;
    }

    // An empty corpus legitimately leaves no cluster; otherwise exactly one root is expected.
    Debug.Assert(N == 0 || clusters.Count == 1);
    mClusterRoot = clusters.FirstOrDefault();
}

/// <summary>
/// Fills the unigram and bigram count tables from the token stream. Tokens not contained
/// in the corpus are skipped and reset adjacency, so no bigram spans across them.
/// </summary>
private static void _CountWordsAndBigrams(
    Corpus corpus,
    IEnumerable<string> word_sequence,
    Dictionary<string, int> word_count_mapping,
    Dictionary<string, Dictionary<string, int>> word_sequence_count_mapping)
{
    string prev_word = null;
    foreach (string word in word_sequence)
    {
        if (!corpus.Contains(word))
        {
            prev_word = null;
            continue;
        }

        _IncrementCount(word_count_mapping, word);

        if (prev_word != null)
        {
            Dictionary<string, int> next_word_counts;
            if (!word_sequence_count_mapping.TryGetValue(prev_word, out next_word_counts))
            {
                next_word_counts = new Dictionary<string, int>();
                word_sequence_count_mapping[prev_word] = next_word_counts;
            }
            _IncrementCount(next_word_counts, word);
        }

        // Advance the window. Without this assignment every word of a run would be
        // paired with the run's FIRST word instead of its immediate predecessor.
        prev_word = word;
    }
}

/// <summary>
/// Expands the sparse bigram table into a square matrix where entry [i][j] is the number
/// of times words[j] directly followed words[i]; missing pairs are 0.
/// </summary>
private static int[][] _BuildBigramCountMatrix(
    string[] words,
    Dictionary<string, Dictionary<string, int>> word_sequence_count_mapping)
{
    int word_count = words.Length;
    int[][] matrix = new int[word_count][];
    for (int i = 0; i < word_count; ++i)
    {
        // New int arrays are zero-initialised, so only observed pairs need writing.
        int[] row = new int[word_count];
        Dictionary<string, int> followers;
        if (word_sequence_count_mapping.TryGetValue(words[i], out followers))
        {
            for (int j = 0; j < word_count; ++j)
            {
                int count;
                if (followers.TryGetValue(words[j], out count))
                {
                    row[j] = count;
                }
            }
        }
        matrix[i] = row;
    }
    return matrix;
}

/// <summary>Returns the stored count for <paramref name="key"/>, or 0 when the key is absent.</summary>
private static int _FrequencyOf(Dictionary<string, int> counts, string key)
{
    int count;
    return counts.TryGetValue(key, out count) ? count : 0;
}

/// <summary>Adds one to the counter stored under <paramref name="key"/>, creating it at 1 if absent.</summary>
private static void _IncrementCount(Dictionary<string, int> counts, string key)
{
    int current;
    counts.TryGetValue(key, out current); // leaves current == 0 when the key is missing
    counts[key] = current + 1;
}

/// <summary>
/// Scores every unordered pair of current clusters in parallel, then replaces the
/// best-scoring pair with a new parent node whose ID is <paramref name="parentId"/>.
/// </summary>
/// <param name="logFormat">Composite format used for the progress line written to the console.</param>
/// <param name="k">Iteration counter, only used for the progress line.</param>
/// <returns>
/// true if a pair was merged; false if no candidate scored above double.MinValue
/// (including the case of fewer than two clusters), in which case nothing is changed.
/// </returns>
private bool _MergeBestPair(
    HashSet<BCNode> clusters,
    string[] words,
    int[] word_counts,
    int[][] word_sequence_counts,
    int parentId,
    string logFormat,
    int k)
{
    // Snapshot so the tasks index a stable array while the set is untouched.
    BCNode[] nodes = clusters.ToArray();
    int pairCount = (nodes.Length - 1) * nodes.Length / 2;

    // The indices travel through the task state object rather than a closure,
    // because the loop variables keep changing while tasks are being started.
    Func<object, ClusterTaskResult> scorePair = (object state) =>
    {
        int[] pair = (int[])state;
        return _EvaluateMerge(nodes, pair[0], pair[1], words, word_counts, word_sequence_counts);
    };

    TaskFactory<ClusterTaskResult> tFactory = new TaskFactory<ClusterTaskResult>();
    Task<ClusterTaskResult>[] tasks = new Task<ClusterTaskResult>[pairCount];
    int nextTask = 0;
    for (int i = 0; i < nodes.Length - 1; ++i)
    {
        for (int j = i + 1; j < nodes.Length; ++j)
        {
            tasks[nextTask++] = tFactory.StartNew(scorePair, new int[] { i, j });
        }
    }
    Debug.Assert(nextTask == pairCount);
    Task.WaitAll(tasks);

    // Arg-max over the candidates; ties keep the earliest pair.
    double maxCQuality = double.MinValue;
    BCNode selectedNode1 = null;
    BCNode selectedNode2 = null;
    for (int i = 0; i < tasks.Length; ++i)
    {
        ClusterTaskResult candidate = tasks[i].Result;
        if (candidate.cQuality > maxCQuality)
        {
            maxCQuality = candidate.cQuality;
            selectedNode1 = candidate.Node1;
            selectedNode2 = candidate.Node2;
        }
    }

    if (selectedNode1 == null || selectedNode2 == null)
    {
        return false;
    }

    Console.WriteLine(logFormat, k, selectedNode1.ID, selectedNode2.ID, maxCQuality);

    BCNode parentNode = new BCNode(parentId, null);
    parentNode.Left = selectedNode1;
    parentNode.Right = selectedNode2;
    clusters.Remove(selectedNode1);
    clusters.Remove(selectedNode2);
    clusters.Add(parentNode);
    return true;
}

/// <summary>
/// Scores the hypothetical merge of nodes[ii] into nodes[jj]: every node except nodes[ii]
/// is passed to BCNode.ToST with its own ID, the words of nodes[ii] are then labelled with
/// the ID of nodes[jj], and the resulting word-to-label map is handed to _CalcClusterQuality.
/// </summary>
/// <remarks>
/// NOTE(review): BCNode.ToST(node, map, id) presumably writes each word under the node
/// into the map with the given label — confirm against BCNode.
/// </remarks>
private ClusterTaskResult _EvaluateMerge(
    BCNode[] nodes,
    int ii,
    int jj,
    string[] words,
    int[] word_counts,
    int[][] word_sequence_counts)
{
    BCNode node1 = nodes[ii];
    Dictionary<string, int> C1 = node1.ToST();
    BCNode node2 = nodes[jj];

    Dictionary<string, int> tempC = new Dictionary<string, int>();
    for (int l = 0; l < nodes.Length; ++l)
    {
        if (l != ii)
        {
            BCNode.ToST(nodes[l], tempC, nodes[l].ID);
        }
    }
    foreach (string key1 in C1.Keys)
    {
        tempC[key1] = node2.ID;
    }

    ClusterTaskResult result = new ClusterTaskResult();
    result.cQuality = _CalcClusterQuality(tempC, words, word_counts, word_sequence_counts);
    result.Node1 = node1;
    result.Node2 = node2;
    return result;
}