/// <summary>
/// Stems the token's original word and records the result on the token.
/// </summary>
/// <param name="token">Token whose OriginalWord is lower-cased, stemmed, and
/// stored in StemmedWord.</param>
internal static void Stem(Token token)
{
    // Use the invariant culture: stemming is a non-linguistic transformation,
    // and culture-sensitive ToLower() can corrupt lookup keys under some
    // locales (e.g. the Turkish dotless-i problem) — CA1304-class fix.
    string word = token.OriginalWord.ToLowerInvariant();
    token.StemmedWord = stemmer.stemTerm(word);
    // Only promote untyped tokens; never overwrite a more specific word type.
    if (token.WordType == WordType.DEFAULT)
    {
        token.WordType = WordType.REGULAR;
    }
}
/// <summary>
/// Looks up <paramref name="word"/> in <paramref name="source"/>; when the
/// surface form is absent, falls back to its stemmed form.
/// </summary>
/// <param name="source">Lexicon mapping words to (valence, arousal, dominance)
/// distribution triples.</param>
/// <param name="word">Word to look up; tried verbatim before stemming.</param>
/// <param name="vad">Receives the matching distribution triple when found.</param>
/// <returns>true when the word or its stem is in the lexicon; otherwise false.</returns>
public bool TryGetWordOrStem(IDataSource <string, ThreeTuple <ContinuousDistribution, ContinuousDistribution, ContinuousDistribution> > source, string word, out ThreeTuple <ContinuousDistribution, ContinuousDistribution, ContinuousDistribution> vad)
{
    // Exact match wins outright.
    if (source.TryGetValue(word, out vad))
    {
        return true;
    }
    // Fall back to the stemmed form, but only when stemming actually changed
    // the word — otherwise we would just repeat the lookup that failed.
    string stemmed = stemmer.stemTerm(word);
    if (stemmed == word)
    {
        return false;
    }
    return source.TryGetValue(stemmed, out vad);
}
/// <summary>
/// Adds a document's words to the inverted index (word -> document id -> list
/// of word positions), optionally removing stop words and stemming first, and
/// assigns the document the next sequential id.
/// </summary>
/// <param name="entry">The document (name plus word list) to index.</param>
/// <returns>The id assigned to the indexed document.</returns>
private int InsertIntoIndex(DocumentToIndex entry)
{
    List <string> documentWords = entry.Words;
    IndexFileCount += 1;
    if (removeStopWords)
    {
        documentWords = documentWords.Where(x => !Constants.stopWords.Contains(x)).ToList();
    }
    if (stemming)
    {
        // NOTE: when removeStopWords is false this mutates entry.Words in
        // place, matching the original behavior.
        for (int i = 0; i < documentWords.Count; i++)
        {
            documentWords[i] = stemmer.stemTerm(documentWords[i]);
        }
    }
    for (int i = 0; i < documentWords.Count; i++)
    {
        this.IndexWordCount++;
        string word = documentWords[i];
        // TryGetValue replaces the original's repeated ContainsKey + indexer
        // double lookups; the resulting index contents are identical.
        Dictionary <int, List <int> > postings;
        if (!this.InvertedIndex.TryGetValue(word, out postings))
        {
            postings = new Dictionary <int, List <int> >();
            this.InvertedIndex[word] = postings;
        }
        List <int> positions;
        if (!postings.TryGetValue(this.currentId, out positions))
        {
            positions = new List <int>();
            postings[this.currentId] = positions;
        }
        positions.Add(i);
    }
    this.IndexedFiles.Add(this.currentId, entry.DocumentName);
    this.currentId++;
    return(this.currentId - 1);
}
/// <summary>
/// Stems every space-separated token of the input and rejoins the stems with
/// single spaces.
/// </summary>
/// <param name="textToProcess">The text whose tokens should be stemmed.</param>
/// <returns>The stemmed text with no leading/trailing space; empty when the
/// input is null or empty.</returns>
public static string Process(string textToProcess)
{
    // Guard: the original would throw on null input.
    if (string.IsNullOrEmpty(textToProcess))
    {
        return string.Empty;
    }
    StemmerInterface porterStemmer = new PorterStemmer();
    // RemoveEmptyEntries keeps consecutive spaces from producing empty
    // tokens, which the original fed to the stemmer and which left stray
    // spaces in the middle of the result.
    string[] tokens = textToProcess.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
    StringBuilder builder = new StringBuilder();
    foreach (string token in tokens)
    {
        if (builder.Length > 0)
        {
            builder.Append(' ');
        }
        builder.Append(porterStemmer.stemTerm(token));
    }
    return builder.ToString();
}
/// <summary>
/// Command-line entry point. Supported arguments:
///   stem     - print the Porter stem of the given word
///   freqrows - with f=FILE, print word counts for each data row
///   emotion  - print estimated ANEW emotion values for the given text
///   emorows  - with f=FILE, print per-row emotion estimates as CSV
/// </summary>
public static void Main(string[] args)
{
    ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());
    PorterStemmer stemmer = new PorterStemmer();

    if (parsedArgs["stem"] != null)
        Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));

    if (parsedArgs["freqrows"] != null)
    {
        // Emits "<count1>,<count2>,\"<row[2]>\"" per row; row[1] is the text
        // counted against the freqrows argument.
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
            Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
        }
    }

    if (parsedArgs["emotion"] != null)
    {
        // NOTE(review): data directory is hard-coded to a developer machine;
        // consider making it configurable.
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
        for (int ii = 0; ii < (int) ANEWEmotionSensor.Emotions.COUNT; ii++)
            Console.WriteLine(((ANEWEmotionSensor.Emotions) ii).ToString() + ": " + emotions[ii]);
    }

    if (parsedArgs["emorows"] != null)
    {
        // NOTE(review): rows/valids are tallied but never reported anywhere.
        int rows = 0, valids = 0;
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            rows++;
            double[] emotions = sensor.EstimateEmotions(row[1]);
            Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," +
                emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," +
                emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
            // emotions[0] is NaN when no lexicon word contributed (see
            // EstimateEmotions), so this counts rows with a usable estimate.
            if (!double.IsNaN(emotions[0]))
                valids++;
        }
    }
}
/// ------------------------------------------------------------------------------------
/// <summary>
/// Finds the best key term in the list of words starting at m_iStartMatch and including
/// up to m_iNextWord. As new words are considered, the list of possible matches
/// (m_matches) is reduced by any that no longer match until there is exactly one match
/// that exactly equals the words in the key term or the list is empty.
/// Returns null when no complete match is available yet (or none starts at
/// m_iStartMatch); in that case m_iStartMatch/m_iNextWord are adjusted so the
/// caller's loop resumes at the right word.
/// </summary>
/// ------------------------------------------------------------------------------------
private KeyTermMatch FindBestKeyTerm()
{
    Word nextWord = m_words[m_iNextWord];
    if (m_iStartMatch == m_iNextWord)
    {
        // First word of a potential match: seed m_matches from the key-terms
        // table, trying the stemmed form when the surface form isn't found.
        List <KeyTermMatch> matches;
        if (!m_keyTermsTable.TryGetValue(nextWord, out matches))
        {
            Word stem = s_stemmer.stemTerm(nextWord);
            if (m_keyTermsTable.TryGetValue(stem, out matches))
            {
                // Record the surface form as an alternate of the stem entry.
                stem.AddAlternateForm(nextWord);
            }
            else
            {
                // No key term starts with this word (or its stem); slide the window.
                m_iStartMatch++;
                return(null);
            }
        }
        // Keep only the candidates applicable to this phrase's reference range.
        m_matches = new List <KeyTermMatch>(matches.Where(m => m.AppliesTo(m_phrase.StartRef, m_phrase.EndRef)));
        // If we found a one-word exact match and there are no other key terms that start
        // with that word, then we return it. The code below would handle this, but it's such
        // a common case, we want it to be fast. If there are one or more multi-word key
        // terms that start with this word, we need to keep looking.
        if (m_matches.Count == 1 && m_matches[0].Words.Count() == 1)
        {
            return(m_matches[0]);
        }
    }

    int cMatchingWordsInTermSoFar = m_iNextWord - m_iStartMatch + 1;
    int lengthOfBestMatch = 0;
    KeyTermMatch longestMatch = null;

    // Remove from the possible matches any that don't match so far
    for (int iTerm = 0; iTerm < m_matches.Count; iTerm++)
    {
        KeyTermMatch term = m_matches[iTerm];
        if (!PhraseEqualsKeyTermSoFar(term, cMatchingWordsInTermSoFar) ||
            (AtEndOfPhrase && term.m_words.Count > cMatchingWordsInTermSoFar))
        {
            // Post-decrement compensates for the removal so no element is skipped.
            m_matches.RemoveAt(iTerm--);
        }
        else if (term.m_words.Count > lengthOfBestMatch)
        {
            // Track the longest still-viable candidate.
            lengthOfBestMatch = term.m_words.Count;
            longestMatch = term;
        }
    }

    if (m_matches.Count == 0)
    {
        // The only matches we had were multi-word matches, and the addition of the current
        // word made it so that none of them matched. Therefore, we don't have a key term
        // starting at iStartMatch.
        m_iNextWord = m_iStartMatch; // The for loop in Parse will increment this.
        m_iStartMatch++;
        return(null);
    }

    // Return the longest match either when it is the sole survivor and shorter
    // than the words consumed so far, or when it exactly spans them.
    if ((m_matches.Count == 1 && lengthOfBestMatch < cMatchingWordsInTermSoFar) ||
        (lengthOfBestMatch == cMatchingWordsInTermSoFar))
    {
        return(longestMatch);
    }

    // Still ambiguous — caller should feed the next word.
    return(null);
}
/// <summary>
/// Estimates emotion scores for a text by looking up each word (or its stem)
/// in the lexicon and combining the per-word distributions, weighting each
/// contribution by inverse variance.
/// </summary>
/// <param name="text">Text to analyze; lower-cased and split into words.</param>
/// <returns>One value per Emotions member. Entries are NaN (0/0) when no word
/// contributed to that emotion — callers detect this via double.IsNaN.</returns>
public double[] EstimateEmotions(string text)
{
    List <string> words = StringUtilities.SplitWords(text.ToLower(), true);

    // 3. Look up each word in ANEWFileSource
    double[] numer = new double[(int)Emotions.COUNT], denom = new double[(int)Emotions.COUNT];
    for (int ii = 0; ii < (int)Emotions.COUNT; ii++)
    {
        numer[ii] = denom[ii] = 0;
    }
    foreach (string word in words)
    {
        // Skip very short tokens and tokens starting with a space.
        // NOTE(review): unclear whether SplitWords can emit space-prefixed
        // tokens — confirm against its implementation.
        if (word.StartsWith(" ") || word.Length <= 2)
        {
            continue;
        }
        ThreeTuple <ContinuousDistribution, ContinuousDistribution, ContinuousDistribution> vad;
        if (!source.TryGetValue(word, out vad))
        {
            // try stemmed word (skip if stemming didn't change it)
            string stem = stemmer.stemTerm(word);
            if (stem == word || !source.TryGetValue(stem, out vad))
            {
                continue;
            }
        }
        // Inverse-variance weighted accumulation of the first three emotions
        // (valence = vad.one, arousal = vad.two, dominance = vad.three).
        numer[(int)Emotions.Valence] += vad.one.Mean / vad.one.Variance;
        denom[(int)Emotions.Valence] += 1 / vad.one.Variance;
        numer[(int)Emotions.Arousal] += vad.two.Mean / vad.two.Variance;
        denom[(int)Emotions.Arousal] += 1 / vad.two.Variance;
        numer[(int)Emotions.Dominance] += vad.three.Mean / vad.three.Variance;
        denom[(int)Emotions.Dominance] += 1 / vad.three.Variance;

        // 4. Apply regressions from other paper
        // Column vector of the three distributions; the regression matrix is
        // chosen by the sign of valence relative to the 0.5 midpoint.
        ContinuousDistribution[,] vector = new ContinuousDistribution[, ] {
            { vad.one }, { vad.two }, { vad.three }
        };
        ContinuousDistribution[,] emotions;
        if (vad.one.Mean >= .5)
        {
            emotions = RandomMatrix.Multiply(positiveProduct, vector);
        }
        else
        {
            emotions = RandomMatrix.Multiply(negativeProduct, vector);
        }

        // 5. Take mean within bounds and sum weighted by variance
        // Remaining emotions (indices 3+) come from the regression output,
        // transformed and clipped to [0, 1] before accumulation.
        for (int ii = 3; ii < (int)Emotions.COUNT; ii++)
        {
            ContinuousDistribution clipped = emotions[ii - 3, 0].Transform(0, .1).Clip(0, 1);
            numer[ii] += clipped.Mean / clipped.Variance;
            denom[ii] += 1 / clipped.Variance;
        }
    }
    // Normalize each accumulated emotion by its total weight; 0/0 -> NaN when
    // nothing contributed (deliberate — see <returns>).
    for (int ii = 0; ii < (int)Emotions.COUNT; ii++)
    {
        numer[ii] /= denom[ii];
    }
    return(numer);
}
/// <summary>
/// Command-line entry point. Supported arguments:
///   stem     - print the Porter stem of the given word
///   freqrows - with f=FILE, print word counts for each data row
///   emotion  - print estimated ANEW emotion values for the given text
///   emorows  - with f=FILE, print per-row emotion estimates as CSV
///   eimpute  - with f=FILE, impute emotional content and write per-row
///              emotion estimates to FILE+"result"
/// </summary>
public static void Main(string[] args)
{
    ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());
    PorterStemmer stemmer = new PorterStemmer();

    if (parsedArgs["stem"] != null)
        Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));

    if (parsedArgs["freqrows"] != null)
    {
        // Emits "<count1>,<count2>,\"<row[2]>\"" per row; row[1] is the text
        // counted against the freqrows argument.
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
            Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
        }
    }

    if (parsedArgs["emotion"] != null)
    {
        // NOTE(review): data directory is hard-coded to a developer machine;
        // consider making it configurable.
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
        for (int ii = 0; ii < (int) ANEWEmotionSensor.Emotions.COUNT; ii++)
            Console.WriteLine(((ANEWEmotionSensor.Emotions) ii).ToString() + ": " + emotions[ii]);
    }

    if (parsedArgs["emorows"] != null)
    {
        // NOTE(review): rows/valids are tallied but never reported anywhere.
        int rows = 0, valids = 0;
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            rows++;
            double[] emotions = sensor.EstimateEmotions(row[1]);
            Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," +
                emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," +
                emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
            if (!double.IsNaN(emotions[0]))
                valids++;
        }
    }

    if (parsedArgs["eimpute"] != null)
    {
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");

        // smallFile path reads the whole file into memory; the default path
        // streams it (column 11, key column 0). Commented-out diagnostic
        // code from the original has been removed.
        bool smallFile = false;
        if (smallFile)
        {
            DataReader reader = new DataReader(parsedArgs["f"]);
            List<List<string>> rows = new List<List<string>>();
            for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
            {
                // NOTE(review): this prints the array's type name, not its
                // contents — confirm whether a joined row was intended.
                Console.WriteLine(row);
                rows.Add(TwitterUtilities.SplitWords(row[10].ToLower()));
            }
            reader.Close();
            sensor.ImputeEmotionalContent(rows, 10, parsedArgs["f"] + "imputed");
        }
        else
        {
            sensor.ImputeEmotionalContentFromFile(parsedArgs["f"], 11, 0, parsedArgs["f"].Substring(0, parsedArgs["f"].Length - 4) + "imputed.csv");
        }

        uint jj = 0;
        using (var stream = File.CreateText(parsedArgs["f"] + "result"))
        {
            DataReader reader = new DataReader(parsedArgs["f"]);
            for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
            {
                // BUG FIX: the progress counter was incremented once, outside
                // this loop, so "#N" progress lines were never printed.
                // It now counts rows and reports every 1000.
                jj++;
                if (jj % 1000 == 0)
                    Console.WriteLine("#" + jj);
                double[] emotions = sensor.EstimateEmotions(row[11]);
                // Echo the first 11 input columns, then append the 8 emotions.
                for (int ii = 0; ii < 11; ii++)
                    stream.Write(row[ii] + ",");
                stream.WriteLine(emotions[0] + "," + emotions[1] + "," + emotions[2] + "," +
                    emotions[3] + "," + emotions[4] + "," + emotions[5] + "," +
                    emotions[6] + "," + emotions[7]);
            }
        }
    }
}