Esempio n. 1
0
        /// <summary>
        /// Stems the token's original word and stores the result on the token.
        /// </summary>
        /// <remarks>
        /// The word is lower-cased with the invariant culture before stemming so the
        /// result does not depend on the current thread culture (for example, a Turkish
        /// culture lower-cases "I" to a dotless "ı" with <c>ToLower()</c>).
        /// A token whose type is still <c>WordType.DEFAULT</c> is promoted to
        /// <c>WordType.REGULAR</c>; any other type is left untouched.
        /// </remarks>
        /// <param name="token">
        /// Token to stem. Its <c>StemmedWord</c> (and possibly its <c>WordType</c>) is
        /// updated in place.
        /// </param>
        internal static void Stem(Token token)
        {
            string word = token.OriginalWord.ToLowerInvariant();

            token.StemmedWord = stemmer.stemTerm(word);
            if (token.WordType == WordType.DEFAULT)
            {
                token.WordType = WordType.REGULAR;
            }
        }
        /// <summary>
        /// Looks up <paramref name="word"/> in <paramref name="source"/>, retrying with the
        /// stemmed form of the word when the exact form is not present.
        /// </summary>
        /// <param name="source">Lookup table to query.</param>
        /// <param name="word">Word to look up.</param>
        /// <param name="vad">
        /// Receives the value produced by the last <c>TryGetValue</c> call made on
        /// <paramref name="source"/>.
        /// </param>
        /// <returns><c>true</c> if the word or its stem was found; otherwise <c>false</c>.</returns>
        public bool TryGetWordOrStem(IDataSource <string, ThreeTuple <ContinuousDistribution, ContinuousDistribution, ContinuousDistribution> > source,
                                     string word, out ThreeTuple <ContinuousDistribution, ContinuousDistribution, ContinuousDistribution> vad)
        {
            if (source.TryGetValue(word, out vad))
            {
                return true;
            }

            // The exact form is unknown: fall back to the stem. When stemming leaves the
            // word unchanged the second lookup is skipped, because that key was just tried.
            string stemmedForm = stemmer.stemTerm(word);
            return stemmedForm != word && source.TryGetValue(stemmedForm, out vad);
        }
Esempio n. 3
0
        /// <summary>
        /// Adds a document to the inverted index and records its name under the next
        /// document id.
        /// </summary>
        /// <remarks>
        /// Stop-word removal and stemming (when enabled) are applied to a private copy of
        /// the word list, so <c>entry.Words</c> is never modified. The positions stored in
        /// the index are offsets into that processed list, i.e. they are counted after
        /// stop words have been removed.
        /// </remarks>
        /// <param name="entry">Document whose words are indexed.</param>
        /// <returns>The id assigned to the document.</returns>
        private int InsertIntoIndex(DocumentToIndex entry)
        {
            List <string> documentWords = entry.Words;

            IndexFileCount += 1;

            if (removeStopWords)
            {
                documentWords = documentWords.Where(x => !Constants.stopWords.Contains(x)).ToList();
            }

            if (stemming)
            {
                // Build a new list instead of overwriting elements: without stop-word
                // removal, documentWords is still the caller's own entry.Words instance.
                List <string> stemmedWords = new List <string>(documentWords.Count);
                foreach (string rawWord in documentWords)
                {
                    stemmedWords.Add(stemmer.stemTerm(rawWord));
                }

                documentWords = stemmedWords;
            }

            int documentId = this.currentId;

            for (int position = 0; position < documentWords.Count; position++)
            {
                this.IndexWordCount++;

                string term = documentWords[position];

                // First occurrence of the term anywhere: create its per-document table.
                if (!this.InvertedIndex.ContainsKey(term))
                {
                    this.InvertedIndex[term] = new Dictionary <int, List <int> >();
                }

                var postings = this.InvertedIndex[term];

                // First occurrence of the term in this document: create its position list.
                if (!postings.ContainsKey(documentId))
                {
                    postings[documentId] = new List <int>();
                }

                postings[documentId].Add(position);
            }

            this.IndexedFiles.Add(documentId, entry.DocumentName);
            this.currentId++;
            return(documentId);
        }
Esempio n. 4
0
        /// <summary>
        /// Stems every space-separated token of the input with a Porter stemmer and
        /// returns the stems separated by a space character.
        /// </summary>
        /// <param name="textToProcess">Text to split on the space character and stem.</param>
        /// <returns>
        /// The stemmed tokens in their original order, with leading and trailing
        /// whitespace trimmed from the combined string.
        /// </returns>
        public static string Process(string textToProcess)
        {
            string[] pieces = textToProcess.Split(' ');

            StemmerInterface porter = new PorterStemmer();

            // Stem each piece into its own slot; the separators are added when joining.
            string[] stems = new string[pieces.Length];
            for (int index = 0; index < pieces.Length; index++)
            {
                stems[index] = porter.stemTerm(pieces[index]);
            }

            return string.Join(" ", stems).Trim();
        }
Esempio n. 5
0
        /// <summary>
        /// Command-line entry point. Each recognized argument triggers an independent
        /// action, in this order: "stem" prints the stem of a word, "freqrows" prints word
        /// counts per row of the file given by "f", "emotion" prints the estimated emotions
        /// of a text, and "emorows" prints estimated emotions per row of the file given by "f".
        /// </summary>
        /// <param name="args">Raw command-line arguments, parsed by <c>ToolArguments</c>.</param>
        public static void Main(string[] args)
        {
            // NOTE(review): developer-specific absolute path; should probably come from
            // an argument or configuration.
            const string dataDirectory = "/Users/jrising/projects/virsona/github/data";

            ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());

            PorterStemmer stemmer = new PorterStemmer();

            if (parsedArgs["stem"] != null) {
                Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));
            }

            if (parsedArgs["freqrows"] != null) {
                DataReader reader = new DataReader(parsedArgs["f"]);
                try {
                    for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                        TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
                        Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
                    }
                } finally {
                    // Release the reader even if a row is malformed and throws.
                    reader.Close();
                }
            }

            if (parsedArgs["emotion"] != null) {
                ANEWEmotionSensor sensor = new ANEWEmotionSensor(dataDirectory);
                double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
                for (int ii = 0; ii < (int) ANEWEmotionSensor.Emotions.COUNT; ii++) {
                    Console.WriteLine(((ANEWEmotionSensor.Emotions) ii).ToString() + ": " + emotions[ii]);
                }
            }

            if (parsedArgs["emorows"] != null) {
                ANEWEmotionSensor sensor = new ANEWEmotionSensor(dataDirectory);
                DataReader reader = new DataReader(parsedArgs["f"]);
                try {
                    for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                        double[] emotions = sensor.EstimateEmotions(row[1]);
                        Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
                    }
                } finally {
                    // Release the reader even if a row is malformed and throws.
                    reader.Close();
                }
            }
        }
Esempio n. 6
0
        /// ------------------------------------------------------------------------------------
        /// <summary>
        /// Finds the best key term in the list of words starting at m_iStartMatch and including
        /// up to m_iNextWord. As new words are considered, the list of possible matches
        /// (m_matches) is reduced by any that no longer match until there is exactly one match
        /// that exactly equals the words in the key term or the list is empty.
        /// </summary>
        /// <returns>
        /// The selected key term match, or null when no key term starts at the current start
        /// position or when the remaining candidates do not yet allow a decision.
        /// </returns>
        /// ------------------------------------------------------------------------------------
        private KeyTermMatch FindBestKeyTerm()
        {
            Word nextWord = m_words[m_iNextWord];

            // Looking at the first word of a potential match: (re)build the candidate list
            // from the key terms table.
            if (m_iStartMatch == m_iNextWord)
            {
                List <KeyTermMatch> matches;
                if (!m_keyTermsTable.TryGetValue(nextWord, out matches))
                {
                    // No entry for the word as written; retry with its stem.
                    Word stem = s_stemmer.stemTerm(nextWord);
                    if (m_keyTermsTable.TryGetValue(stem, out matches))
                    {
                        // Record the word as written as an alternate form of the stem.
                        stem.AddAlternateForm(nextWord);
                    }
                    else
                    {
                        // Neither the word nor its stem begins a key term: move the start
                        // position past this word.
                        m_iStartMatch++;
                        return(null);
                    }
                }

                // Keep only the candidates that apply to the phrase's reference range.
                m_matches = new List <KeyTermMatch>(matches.Where(m => m.AppliesTo(m_phrase.StartRef, m_phrase.EndRef)));

                // If we found a one-word exact match and there are no other key terms that start
                // with that word, then we return it. The code below would handle this, but it's such
                // a common case, we want it to be fast. If there are one or more multi-word key
                // terms that start with this word, we need to keep looking.
                if (m_matches.Count == 1 && m_matches[0].Words.Count() == 1)
                {
                    return(m_matches[0]);
                }
            }

            // Number of phrase words under consideration, from the start position through
            // the next word inclusive.
            int          cMatchingWordsInTermSoFar = m_iNextWord - m_iStartMatch + 1;
            int          lengthOfBestMatch         = 0;
            KeyTermMatch longestMatch = null;

            // Remove from the possible matches any that don't match so far
            for (int iTerm = 0; iTerm < m_matches.Count; iTerm++)
            {
                KeyTermMatch term = m_matches[iTerm];
                // A term is also dropped when the phrase has ended but the term has more
                // words than have been considered.
                if (!PhraseEqualsKeyTermSoFar(term, cMatchingWordsInTermSoFar) ||
                    (AtEndOfPhrase && term.m_words.Count > cMatchingWordsInTermSoFar))
                {
                    // RemoveAt shifts the following items down by one, so step the index
                    // back to examine the item that now occupies this slot.
                    m_matches.RemoveAt(iTerm--);
                }
                else if (term.m_words.Count > lengthOfBestMatch)
                {
                    // Strict comparison: among surviving terms of equal length the first
                    // one in list order is kept.
                    lengthOfBestMatch = term.m_words.Count;
                    longestMatch      = term;
                }
            }

            if (m_matches.Count == 0)
            {
                // The only matches we had were multi-word matches, and the addition of the current
                // word made it so that none of them matched. Therefore, we don't have a key term
                // starting at iStartMatch.
                m_iNextWord = m_iStartMatch;                 // The for loop in Parse will increment this.
                m_iStartMatch++;
                return(null);
            }

            // Return the longest surviving term when its length equals the number of words
            // considered so far, or when it is the only candidate left and is shorter than that.
            // NOTE(review): confirm under which circumstances the "shorter" clause can occur.
            if ((m_matches.Count == 1 && lengthOfBestMatch < cMatchingWordsInTermSoFar) || (lengthOfBestMatch == cMatchingWordsInTermSoFar))
            {
                return(longestMatch);
            }

            // Candidates remain but none can be selected yet; presumably the caller supplies
            // the next word and calls again.
            return(null);
        }
Esempio n. 7
0
        /// <summary>
        /// Estimates an emotion vector for a text by combining per-word estimates, each
        /// weighted by the inverse of its variance.
        /// </summary>
        /// <remarks>
        /// The text is lower-cased with the invariant culture so that lookups do not depend
        /// on the current thread culture. Words of two characters or fewer, or starting with
        /// a space, are skipped. Every other word is looked up in the source, falling back to
        /// its stem; words found neither way are skipped.
        /// When no word contributes, each element of the result is 0/0, i.e.
        /// <c>double.NaN</c>.
        /// </remarks>
        /// <param name="text">Text to analyse.</param>
        /// <returns>
        /// An array of <c>Emotions.COUNT</c> values indexed by <c>Emotions</c>.
        /// </returns>
        public double[] EstimateEmotions(string text)
        {
            List <string> words = StringUtilities.SplitWords(text.ToLowerInvariant(), true);

            // 3. Look up each word in ANEWFileSource
            // New arrays are zero-initialized, so the sums need no explicit clearing.
            int emotionCount = (int)Emotions.COUNT;
            double[] numer = new double[emotionCount], denom = new double[emotionCount];

            foreach (string word in words)
            {
                if (word.Length <= 2 || word[0] == ' ')
                {
                    continue;
                }

                ThreeTuple <ContinuousDistribution, ContinuousDistribution, ContinuousDistribution> vad;
                if (!source.TryGetValue(word, out vad))
                {
                    // try stemmed word
                    string stem = stemmer.stemTerm(word);
                    if (stem == word || !source.TryGetValue(stem, out vad))
                    {
                        continue;
                    }
                }

                AccumulateWeighted(numer, denom, (int)Emotions.Valence, vad.one);
                AccumulateWeighted(numer, denom, (int)Emotions.Arousal, vad.two);
                AccumulateWeighted(numer, denom, (int)Emotions.Dominance, vad.three);

                // 4. Apply regressions from other paper
                ContinuousDistribution[,] vector = new ContinuousDistribution[, ] {
                    { vad.one }, { vad.two }, { vad.three }
                };

                // The regression matrix is chosen by the mean of the first component.
                ContinuousDistribution[,] emotions;
                if (vad.one.Mean >= .5)
                {
                    emotions = RandomMatrix.Multiply(positiveProduct, vector);
                }
                else
                {
                    emotions = RandomMatrix.Multiply(negativeProduct, vector);
                }

                // 5. Take mean within bounds and sum weighted by variance
                // Slots 0-2 were filled above; regression row r feeds slot r + 3.
                for (int ii = 3; ii < emotionCount; ii++)
                {
                    ContinuousDistribution clipped = emotions[ii - 3, 0].Transform(0, .1).Clip(0, 1);
                    AccumulateWeighted(numer, denom, ii, clipped);
                }
            }

            // Turn the weighted sums into weighted means (NaN where nothing was accumulated).
            for (int ii = 0; ii < emotionCount; ii++)
            {
                numer[ii] /= denom[ii];
            }

            return(numer);
        }

        /// <summary>
        /// Adds one estimate to the running inverse-variance-weighted sums at the given slot.
        /// </summary>
        private static void AccumulateWeighted(double[] numer, double[] denom, int index, ContinuousDistribution estimate)
        {
            numer[index] += estimate.Mean / estimate.Variance;
            denom[index] += 1 / estimate.Variance;
        }
Esempio n. 8
0
        /// <summary>
        /// Command-line entry point. Each recognized argument triggers an independent
        /// action, in this order: "stem" prints the stem of a word, "freqrows" prints word
        /// counts per row of the file given by "f", "emotion" prints the estimated emotions
        /// of a text, "emorows" prints estimated emotions per row of the file given by "f",
        /// and "eimpute" imputes emotional content from that file and then writes every row
        /// with its estimated emotions to a "result" file next to it.
        /// </summary>
        /// <param name="args">Raw command-line arguments, parsed by <c>ToolArguments</c>.</param>
        public static void Main(string[] args)
        {
            // NOTE(review): developer-specific absolute path; should probably come from
            // an argument or configuration.
            const string dataDirectory = "/Users/jrising/projects/virsona/github/data";

            ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());

            PorterStemmer stemmer = new PorterStemmer();

            if (parsedArgs["stem"] != null) {
                Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));
            }

            /*ANEWEmotionSensor sensor2 = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
            for (int rr = 0; rr < sensor2.positiveMatrix.GetLength(0); rr++) {
                for (int cc = 0; cc < sensor2.positiveMatrix.GetLength(1); cc++)
                    Console.Write(sensor2.positiveMatrix[rr, cc] + ", ");
                Console.WriteLine(" - ");
            }
            for (int rr = 0; rr < sensor2.negativeMatrix.GetLength(0); rr++) {
                for (int cc = 0; cc < sensor2.negativeMatrix.GetLength(1); cc++)
                    Console.Write(sensor2.negativeMatrix[rr, cc] + ", ");
                Console.WriteLine(" - ");
            }
            return;*/

            if (parsedArgs["freqrows"] != null) {
                DataReader reader = new DataReader(parsedArgs["f"]);
                try {
                    for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                        TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
                        Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
                    }
                } finally {
                    // Release the reader even if a row is malformed and throws.
                    reader.Close();
                }
            }

            if (parsedArgs["emotion"] != null) {
                ANEWEmotionSensor sensor = new ANEWEmotionSensor(dataDirectory);
                double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
                for (int ii = 0; ii < (int) ANEWEmotionSensor.Emotions.COUNT; ii++) {
                    Console.WriteLine(((ANEWEmotionSensor.Emotions) ii).ToString() + ": " + emotions[ii]);
                }
            }

            if (parsedArgs["emorows"] != null) {
                ANEWEmotionSensor sensor = new ANEWEmotionSensor(dataDirectory);
                DataReader reader = new DataReader(parsedArgs["f"]);
                try {
                    for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                        double[] emotions = sensor.EstimateEmotions(row[1]);
                        Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
                    }
                } finally {
                    // Release the reader even if a row is malformed and throws.
                    reader.Close();
                }
            }

            if (parsedArgs["eimpute"] != null) {
                ANEWEmotionSensor sensor = new ANEWEmotionSensor(dataDirectory);

                // DIAGNOSTIC
                /*List<List<string>> rows = new List<List<string>>();
                rows.Add(TwitterUtilities.SplitWords("happy aaaa cccc"));
                rows.Add(TwitterUtilities.SplitWords("sad bbbb cccc"));

                IDataSource<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>> inputed = sensor.ImputeEmotionalContent(rows, 1000);
                foreach (KeyValuePair<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>> kvp in inputed)
                    Console.WriteLine(kvp.Key + ": " + kvp.Value.one.Mean + ", " + kvp.Value.two.Mean + ", " + kvp.Value.three.Mean);*/

                // Developer toggle: set to true to load the whole file into memory first.
                bool smallFile = false;
                if (smallFile) {
                    DataReader reader = new DataReader(parsedArgs["f"]);
                    List<List<string>> rows = new List<List<string>>();
                    try {
                        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                            // Print the fields; writing the array itself would only print its type name.
                            Console.WriteLine(string.Join(",", row));
                            rows.Add(TwitterUtilities.SplitWords(row[10].ToLower()));
                        }
                    } finally {
                        reader.Close();
                    }

                    /*IDataSource<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>> inputed = sensor.ImputeEmotionalContent(rows, 10);
                    double minv = 1, maxv = 0;
                    foreach (KeyValuePair<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>> kvp in inputed) {
                        minv = Math.Min(minv, kvp.Value.one.Mean);
                        maxv = Math.Max(maxv, kvp.Value.one.Mean);
                        Console.WriteLine(kvp.Key + ": " + kvp.Value.one.Mean + " x " + kvp.Value.one.Variance + ", " + kvp.Value.two.Mean + ", " + kvp.Value.three.Mean);
                    }

                    Console.WriteLine("Min: " + minv + ", Max: " + maxv);*/

                    sensor.ImputeEmotionalContent(rows, 10, parsedArgs["f"] + "imputed");
                } else {
                    // Drops the last four characters of the path before appending the suffix;
                    // presumably a ".csv" extension, and the path must be at least four characters long.
                    sensor.ImputeEmotionalContentFromFile(parsedArgs["f"], 11, 0, parsedArgs["f"].Substring(0, parsedArgs["f"].Length - 4) + "imputed.csv");
                }

                uint processedRows = 0;
                using (var stream = File.CreateText(parsedArgs["f"] + "result")) {
                    DataReader reader = new DataReader(parsedArgs["f"]);
                    try {
                        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                            // Progress is counted per row so a marker is printed every 1000 rows.
                            processedRows++;
                            if (processedRows % 1000 == 0) {
                                Console.WriteLine("#" + processedRows);
                            }

                            double[] emotions = sensor.EstimateEmotions(row[11]);
                            for (int ii = 0; ii < 11; ii++) {
                                stream.Write(row[ii] + ",");
                            }
                            stream.WriteLine(emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7]);
                        }
                    } finally {
                        // Release the reader even if a row is malformed and throws.
                        reader.Close();
                    }
                }
            }
        }