Пример #1
0
        /// <summary>
        /// Builds a vector with one term-frequency value per word n-gram of <paramref name="article"/>.
        /// </summary>
        /// <remarks>
        /// Only <c>CalculateTermFrequency</c> is applied to each n-gram, measured against the
        /// article's own n-gram list; no inverse-document-frequency step is performed here.
        /// </remarks>
        /// <param name="article">Article whose tokens are joined into space-separated n-grams.</param>
        /// <param name="documents">Not used by the current implementation; kept so existing callers compile.</param>
        /// <param name="n">Number of consecutive tokens per n-gram.</param>
        /// <returns>
        /// One value per generated n-gram, in token order. Empty when the article does not
        /// have more than <paramref name="n"/> tokens.
        /// </returns>
        public static double[] ONLY_TF_ExtractNGramTfIdfVector(TokenizedArticle article, List <TokenizedArticle> documents, int n = 3)
        {
            // Project the words once instead of re-projecting every token on each iteration.
            var words      = article.Tokens.Select(t => t.Word).ToList();
            int nGramCount = words.Count - n;

            List <string> nGrams = new List <string>();

            // A non-positive nGramCount simply skips the loop, so short articles yield an
            // empty vector (the previous unused array allocation threw on a negative size).
            // NOTE(review): this bound leaves out the n-gram that starts at index Count - n;
            // it is preserved so the vector length seen by callers stays the same - confirm intent.
            for (int i = 0; i < nGramCount; i++)
            {
                nGrams.Add(String.Join(" ", words.Skip(i).Take(n).ToArray()));
            }

            // TODO: add keywords, and consider a dictionary instead of an array.
            return nGrams.Select(g => CalculateTermFrequency(g, nGrams)).ToArray();
        }
Пример #2
0
        /// <summary>
        /// Counts token occurrences across all articles carrying <paramref name="label"/> and
        /// returns the most frequent ones.
        /// </summary>
        /// <param name="articles">Labeled articles to scan; only those whose label equals <paramref name="label"/> are processed.</param>
        /// <param name="label">Label selecting the articles to count.</param>
        /// <param name="termCount">Maximum number of terms to return.</param>
        /// <param name="stopList">Stop list handed to <c>StopWordsFilterProcessor.Process</c>.</param>
        /// <returns>Up to <paramref name="termCount"/> terms with the highest counts, mapped to their counts.</returns>
        private static Dictionary <string, int> GetMostFrequentTermsForLabel(List <LabeledArticle> articles, string label, int termCount = 20, string[] stopList = null)
        {
            List <TokenizedArticle> tokenizedArticles = new List <TokenizedArticle>();

            // Pre-process only the articles with the requested label:
            // clean special characters -> tokenize -> drop stop words -> lemmatize.
            foreach (var article in articles.Where(a => a.Label == label))
            {
                var cleanedBody = TextUtility.ReplaceSpecialCharacters(article.Article.Body);
                var words       = StopWordsFilterProcessor.Process(Tokenizer.TokenizeWords(cleanedBody), stopList);
                words = Lemmatizer.Process(words);
                tokenizedArticles.Add(new TokenizedArticle(article, words));
            }

            Dictionary <string, int> countDictionary = new Dictionary <string, int>();

            foreach (var tokenizedArticle in tokenizedArticles)
            {
                foreach (var token in tokenizedArticle.Tokens)
                {
                    // Single lookup: TryGetValue leaves 'seen' at 0 for a word not counted yet.
                    int seen;
                    countDictionary.TryGetValue(token.Word, out seen);
                    countDictionary[token.Word] = seen + 1;
                }
            }

            return countDictionary
                   .OrderByDescending(pair => pair.Value)
                   .Take(termCount)
                   .ToDictionary(pair => pair.Key, pair => pair.Value);
        }
Пример #3
0
 /// <summary>
 /// Builds a vector with one <c>CalculateTfIdf</c> value per token of <paramref name="article"/>.
 /// </summary>
 /// <param name="article">Article whose tokens are scored, in order.</param>
 /// <param name="documents">Documents whose token words are passed to <c>CalculateTfIdf</c> as the corpus.</param>
 /// <returns>One value per token of <paramref name="article"/>, in token order.</returns>
 public static double[] ExtractTfIdfVector(TokenizedArticle article, List <TokenizedArticle> documents)
 {
     // Build the word lists once; they are identical for every token, so rebuilding them
     // inside the per-token lambda only repeated the same work.
     // NOTE(review): assumes CalculateTfIdf does not modify the lists it receives - confirm.
     var articleWords = article.Tokens.Select(at => at.Word).ToList();
     var corpusWords  = documents.Select(d => d.Tokens.Select(at => at.Word).ToList()).ToList();

     return article.Tokens.Select(t => CalculateTfIdf(t.Word, articleWords, corpusWords)).ToArray();
 }