/// <summary>
/// Builds word n-grams from the article's tokens and returns, for each n-gram,
/// the value <c>CalculateTermFrequency</c> yields for it against the article's
/// own n-gram list. Only term frequency is computed here; no document-level
/// statistic is applied.
/// </summary>
/// <param name="article">Article whose tokens are combined into n-grams.</param>
/// <param name="documents">Currently unused; kept so existing callers keep compiling.</param>
/// <param name="n">Number of consecutive words joined (with a single space) into one n-gram.</param>
/// <returns>
/// One value per generated n-gram, in token order. An empty array when the
/// article does not have more than <paramref name="n"/> tokens.
/// </returns>
public static double[] ONLY_TF_ExtractNGramTfIdfVector(TokenizedArticle article, List<TokenizedArticle> documents, int n = 3)
{
    // Project the words once instead of re-running the Select for every n-gram.
    string[] words = article.Tokens.Select(t => t.Word).ToArray();

    // NOTE(review): this bound skips the final n-gram (the one starting at
    // words.Length - n). It is preserved so the vector length seen by existing
    // callers does not change — confirm whether "<=" was intended.
    int nGramCount = words.Length - n;
    if (nGramCount <= 0)
    {
        // Articles too short to form any n-gram previously crashed while
        // allocating a negative-length array; they now yield an empty vector.
        return new double[0];
    }

    List<string> nGrams = new List<string>(nGramCount);
    for (int i = 0; i < nGramCount; i++)
    {
        nGrams.Add(String.Join(" ", words.Skip(i).Take(n).ToArray()));
    }

    // TODO: add keywords, and maybe return a dictionary instead of an array.
    return nGrams.Select(g => CalculateTermFrequency(g, nGrams)).ToArray();
}
/// <summary>
/// Tallies token words over every article carrying the requested label and
/// returns the words with the highest counts.
/// </summary>
/// <param name="articles">Labeled articles to scan.</param>
/// <param name="label">Only articles whose <c>Label</c> equals this value are counted.</param>
/// <param name="termCount">Maximum number of entries in the result.</param>
/// <param name="stopList">Passed through to <c>StopWordsFilterProcessor.Process</c>.</param>
/// <returns>Word-to-count pairs, taken from the counts sorted in descending order.</returns>
private static Dictionary<string, int> GetMostFrequentTermsForLabel(List<LabeledArticle> articles, string label, int termCount = 20, string[] stopList = null)
{
    // Step 1: preprocess each article that carries the requested label.
    var matching = new List<TokenizedArticle>();
    foreach (var candidate in articles)
    {
        if (!(candidate.Label == label))
        {
            continue;
        }

        var cleanedBody = TextUtility.ReplaceSpecialCharacters(candidate.Article.Body);
        var words = StopWordsFilterProcessor.Process(Tokenizer.TokenizeWords(cleanedBody), stopList);
        words = Lemmatizer.Process(words);
        matching.Add(new TokenizedArticle(candidate, words));
    }

    // Step 2: count occurrences of every token word across those articles.
    var frequencies = new Dictionary<string, int>();
    foreach (var token in matching.SelectMany(a => a.Tokens))
    {
        var word = token.Word;
        int seenSoFar;
        // A missing key leaves seenSoFar at 0, so the first sighting stores 1.
        frequencies.TryGetValue(word, out seenSoFar);
        frequencies[word] = seenSoFar + 1;
    }

    // Step 3: keep only the termCount most frequent words.
    var ranked = frequencies.OrderByDescending(entry => entry.Value);
    var top = ranked.Take(termCount);
    return top.ToDictionary(entry => entry.Key, entry => entry.Value);
}
/// <summary>
/// Returns one value per token of the article, obtained by passing the token's
/// word, the article's word list and the word lists of all documents to
/// <c>CalculateTfIdf</c>.
/// </summary>
/// <param name="article">Article whose tokens are scored.</param>
/// <param name="documents">Corpus whose per-document word lists are handed to <c>CalculateTfIdf</c>.</param>
/// <returns>An array with one entry per token of <paramref name="article"/>, in token order.</returns>
public static double[] ExtractTfIdfVector(TokenizedArticle article, List<TokenizedArticle> documents)
{
    // Nothing to score: return before projecting the corpus, which the
    // original lazy query also never touched for an empty article.
    if (article.Tokens.Count == 0)
    {
        return new double[0];
    }

    // Both word lists are the same for every token, so build them once rather
    // than once per token (the original re-projected the whole corpus each time).
    // NOTE(review): the same list instances are now shared across calls; this
    // assumes CalculateTfIdf does not mutate its list arguments — confirm.
    List<string> articleWords = article.Tokens.Select(at => at.Word).ToList();
    List<List<string>> corpusWords = documents
        .Select(d => d.Tokens.Select(at => at.Word).ToList())
        .ToList();

    return article.Tokens
        .Select(t => CalculateTfIdf(t.Word, articleWords, corpusWords))
        .ToArray();
}