예제 #1
0
        /// <summary>
        /// Gets, for each task index, the word indices of the task's document and the number of those indices.
        /// </summary>
        /// <param name="data">The data; each datum supplies a task id and a body text.</param>
        /// <param name="wordIndicesPerTaskIndex">On return, the word indices (columns) for each task index (rows).</param>
        /// <param name="wordCountsPerTaskIndex">On return, the length of the word-index array of each task index.</param>
        public void GetWordIndicesAndCountsPerTaskIndex(IEnumerable<Datum> data, out int[][] wordIndicesPerTaskIndex, out int[] wordCountsPerTaskIndex)
        {
            // One document slot per task index. Slots of tasks that do not occur in data stay null.
            string[] corpus = new string[TaskCount];

            foreach (var taskGroup in data.GroupBy(d => d.TaskId))
            {
                var labelings = taskGroup.ToArray();

                // Take the body text of a randomly ordered labeling of the task: only the first
                // element of the permutation is needed, so the full shuffled array is not built.
                // GroupBy never yields an empty group, so perm[0] always exists.
                var perm = Rand.Perm(labelings.Length);
                corpus[TaskIdToIndex[taskGroup.Key]] = labelings[perm[0]].BodyText;
            }

            // NOTE(review): presumably maps each document to the indices of its stemmed words in Vocabulary - confirm against TFIDFClass.
            wordIndicesPerTaskIndex = TFIDFClass.GetWordIndexStemmedDocs(corpus, Vocabulary);
            wordCountsPerTaskIndex = wordIndicesPerTaskIndex.Select(indices => indices.Length).ToArray();
        }
예제 #2
0
        /// <summary>
        /// Builds a vocabulary from the terms whose normalized TF-IDF weight exceeds a threshold
        /// in at least one row of the transformed corpus.
        /// </summary>
        /// <param name="corpus">The corpus passed to <c>TFIDFClass.Transform</c> (presumably one document per element).</param>
        /// <param name="tfidf_threshold">Exclusive lower bound on the normalized TF-IDF weight a term must reach to be selected.</param>
        /// <returns>
        /// The distinct selected terms, in order of first selection: rows are processed in order and,
        /// within a row, terms are taken by descending weight.
        /// </returns>
        private static List<string> BuildVocabularyFromCorpus(string[] corpus, double tfidf_threshold = 0.8)
        {
            List<string> vocabulary;

            double[][] inputs = TFIDFClass.Transform(corpus, out vocabulary, 0);
            inputs = TFIDFClass.Normalize(inputs);

            // Select high TF-IDF terms. Column i of every row is the weight of vocabulary[i].
            var vocabularyTfidf = new List<string>();

            foreach (double[] weights in inputs)
            {
                // Filtering before sorting yields the same order as sorting first,
                // because OrderByDescending is a stable sort.
                vocabularyTfidf.AddRange(
                    weights.Select((weight, i) => new KeyValuePair<string, double>(vocabulary[i], weight))
                           .Where(entry => entry.Value > tfidf_threshold)
                           .OrderByDescending(entry => entry.Value)
                           .Select(entry => entry.Key));
            }

            // Return the thresholded terms; returning the full vocabulary would make tfidf_threshold a no-op.
            return vocabularyTfidf.Distinct().ToList();
        }