/// <summary>
/// Computes, for every task index, the vocabulary word indices of that task's body text
/// and the number of such indices.
/// </summary>
/// <param name="data">The data. Items are grouped by <c>TaskId</c>; one item per task supplies the body text.</param>
/// <param name="wordIndicesPerTaskIndex">
/// Jagged array of word indices for each task index, as produced by
/// <c>TFIDFClass.GetWordIndexStemmedDocs</c> from the per-task corpus and <c>Vocabulary</c>.
/// </param>
/// <param name="wordCountsPerTaskIndex">Length of the word-index array of each task index.</param>
public void GetWordIndicesAndCountsPerTaskIndex(IEnumerable<Datum> data, out int[][] wordIndicesPerTaskIndex, out int[] wordCountsPerTaskIndex)
{
    // One document slot per task index. A task index with no datum in `data` keeps a null entry.
    string[] corpus = new string[TaskCount];

    // For each task id, pick the datum whose body text represents the task.
    // The choice is random: the datum at position perm[0] of the group is taken, which is exactly
    // the element that a full shuffle of the group by `perm` would place first.
    // Rand.Perm is still called once per group so the random number stream is consumed as before.
    var representativePerTaskId = data
        .GroupBy(d => d.TaskId)
        .Select(g =>
        {
            var labelings = g.ToArray();
            var perm = Rand.Perm(labelings.Length);
            return new { TaskId = g.Key, Representative = labelings[perm[0]] };
        })
        .ToDictionary(a => a.TaskId, a => a.Representative);

    foreach (var kvp in representativePerTaskId)
    {
        corpus[TaskIdToIndex[kvp.Key]] = kvp.Value.BodyText;
    }

    // The out parameters are assigned here only; no placeholder arrays are allocated up front.
    wordIndicesPerTaskIndex = TFIDFClass.GetWordIndexStemmedDocs(corpus, Vocabulary);
    wordCountsPerTaskIndex = wordIndicesPerTaskIndex.Select(t => t.Length).ToArray();
}
/// <summary>
/// Builds a vocabulary made of the terms whose normalized TF-IDF weight exceeds
/// <paramref name="tfidf_threshold"/> in at least one row of the transformed corpus.
/// </summary>
/// <param name="corpus">The texts passed to <c>TFIDFClass.Transform</c>.</param>
/// <param name="tfidf_threshold">TFIDF threshold; only terms with a weight strictly greater than this value are kept.</param>
/// <returns>
/// The distinct selected terms, in order of first appearance: rows are visited in order and,
/// within a row, terms are listed by descending weight.
/// </returns>
private static List<string> BuildVocabularyFromCorpus(string[] corpus, double tfidf_threshold = 0.8)
{
    List<string> vocabulary;

    // NOTE(review): the meaning of the third argument (0) is defined by TFIDFClass.Transform - confirm there.
    double[][] inputs = TFIDFClass.Transform(corpus, out vocabulary, 0);
    inputs = TFIDFClass.Normalize(inputs);

    // Select high TF_IDF terms. Column i of every row is the weight of vocabulary[i].
    var vocabularyTfidf = new List<string>();
    foreach (double[] weights in inputs)
    {
        // Filter before sorting: OrderByDescending is stable, so the resulting order is the same
        // as sorting the whole row first, but only the surviving terms are sorted.
        vocabularyTfidf.AddRange(
            weights
                .Select((weight, i) => new KeyValuePair<string, double>(vocabulary[i], weight))
                .Where(entry => entry.Value > tfidf_threshold)
                .OrderByDescending(entry => entry.Value)
                .Select(entry => entry.Key));
    }

    // Return the filtered terms. Previously the unfiltered vocabulary was returned, which
    // discarded the selection above and made tfidf_threshold have no effect.
    return vocabularyTfidf.Distinct().ToList();
}