public void Stem(string word, string wordExpected)
{
    // Arrange: a fresh stemmer per case keeps tests independent.
    var stemmer = new PorterStemmer();

    // Act: stem the raw input word.
    var actual = stemmer.Stem(word);

    // Assert: the produced stem matches the expected value.
    Assert.AreEqual(wordExpected, actual);
}
// Tokenizes and stems the free-text query, then delegates to the
// token-list overload of Execute.
public LinkedList<IndexEntry> Execute(string query, CrawlerRegistry registry, int maxResults = 25, bool usePageRank = false)
{
    var stemmer = new PorterStemmer();
    var stemmedTokens = new HashSet<string>();

    foreach (var token in query.ToLower().Split(' '))
    {
        // Drop single characters and stop words before stemming.
        if (token.Length <= 1 || StopWords.StopWordsList.Contains(token))
        {
            continue;
        }
        stemmedTokens.Add(stemmer.StemWord(token.ToLower()));
    }

    return this.Execute(stemmedTokens.ToList(), registry, maxResults, usePageRank);
}
public void TestFromPorter()
{
    // English stemmer; each dictionary entry maps a raw word to its
    // expected stem.
    var stemmer = new PorterStemmer("en");

    foreach (var entry in dic)
    {
        var stemmed = stemmer.Stem(entry.Key);
        Assert.AreEqual(entry.Value, stemmed);
    }
}
// Tokenizes the raw query, upper-cases tokens listed in
// StopWords.BooleanWords (operator keywords), stems everything else,
// and delegates to the token-list overload of ParseQuery.
public QueryPart ParseQuery(string input)
{
    var stemmer = new PorterStemmer();
    var stemmedTokens = new List<string>();

    foreach (var token in input.ToLower().Split(' '))
    {
        // Drop single characters and boolean-query stop words.
        if (token.Length <= 1 || StopWords.BooleanStopWordsList.Contains(token))
        {
            continue;
        }

        stemmedTokens.Add(StopWords.BooleanWords.Contains(token.ToLower())
            ? token.ToUpper()
            : stemmer.StemWord(token.ToLower()));
    }

    return this.ParseQuery(stemmedTokens);
}
// Builds an inverted index over all files under <paramref name="folder"/>:
// maps each stemmed term to a dictionary of (file path -> occurrence count).
// Also updates the instance fields internalIndex and indexCount.
public Dictionary<string, Dictionary<string, double>> InvertedIndex(string folder)
{
    converter = new Converter();

    // Clear any existing index before rebuilding so its memory is reclaimed.
    if (internalIndex != null)
    {
        internalIndex.Clear();
    }

    internalIndex = new Dictionary<string, Dictionary<string, double>>(); // the inverted index to be returned
    indexCount = 0; // number of distinct terms in the index

    // One stemmer instance is reused for every word of every file.
    PorterStemmer stemmer = new PorterStemmer();

    foreach (string file in IndexingFolders(folder))
    {
        foreach (string word in ScanFiles.scanFiles(file))
        {
            string stemmedWord = stemmer.StemWord(word);

            Dictionary<string, double> fileList;
            if (internalIndex.TryGetValue(stemmedWord, out fileList))
            {
                // Term already indexed: bump this file's count.
                // BUG FIX: the original round-tripped the stored double
                // through a string (double.Parse(value.ToString()) + 1);
                // the value is already a double, so just increment it.
                // fileList is the dictionary stored in internalIndex, so no
                // write-back is needed.
                if (fileList.ContainsKey(file))
                {
                    fileList[file] = fileList[file] + 1;
                }
                else
                {
                    fileList.Add(file, 1.0);
                }
            }
            else
            {
                // First sighting of this term: start a new file list.
                internalIndex.Add(stemmedWord, new Dictionary<string, double> { { file, 1.0 } });
                indexCount++;
            }
        }
    }

    return internalIndex;
}
public void Test_StemWordOutPut_Matches_StaticOutput()
{
    // Walk three directories up from the test binary toward the project root,
    // where the StemmerTestFiles fixtures live.
    string filepath = Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory);
    filepath = Directory.GetParent(Directory.GetParent(Directory.GetParent(filepath).FullName).FullName).FullName;

    List<string> StaticOutput = new List<string>();
    List<string> TestOutput = new List<string>();

    // Load the expected (pre-stemmed) words, one per line.
    try
    {
        string staticOutputPath = filepath + @"\StemmerTestFiles\OutputWords.txt";
        using (var stream = new StreamReader(staticOutputPath))
        {
            for (string line = stream.ReadLine(); line != null; line = stream.ReadLine())
            {
                StaticOutput.Add(line);
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("The file could not be read:");
        Console.WriteLine(e.Message);
    }

    // Stem each raw word and collect the results in file order.
    PorterStemmer ps = new PorterStemmer();
    try
    {
        string staticOutputPath = filepath + @"\StemmerTestFiles\RawWords.txt";
        using (var stream = new StreamReader(staticOutputPath))
        {
            for (string line = stream.ReadLine(); line != null; line = stream.ReadLine())
            {
                TestOutput.Add(ps.StemWord(line));
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("The file could not be read:");
        Console.WriteLine(e.Message);
    }

    // Sequences must match element-for-element, in order.
    Assert.IsTrue(StaticOutput.SequenceEqual(TestOutput));
}
/// <summary>
/// Transforms the list of terms into their root form.
/// Uses the Porter Stemming algorithm.
/// </summary>
/// <param name="wordList">List of terms to edit</param>
/// <returns>A new list containing the stem of each input term, in order.</returns>
public static List<string> StemWords(List<string> wordList)
{
    var stemmer = new PorterStemmer();
    var stemmedList = new List<string>(wordList.Count);
    foreach (var term in wordList)
    {
        stemmedList.Add(stemmer.StemWord(term));
    }
    return stemmedList;
}
// Full-text search entry point: cleans and stems the query, scores documents
// by the fraction of their indexed words that match, and returns the top-10
// matching documents with title/subtitle metadata attached.
public static List<DocumentResult> Search(string input)
{
    // Take the input, split up into words while discarding symbols and numbers,
    // then remove the stop words and set all cases to lowercase.
    var tokens = input.Split(" ", StringSplitOptions.RemoveEmptyEntries)
        .Select(x => (x.Where(c => char.IsLetter(c)).Aggregate("", (current, c) => current + c)).ToLower());
    tokens = tokens.Where(x => !BLL.Constants.stopwords.Contains(x)).Distinct();

    var stemmedTokens = new List<string>();
    // Instantiate the stemmer.
    PorterStemmer stem = new PorterStemmer();
    // Stem all the words in the input and add to the list.
    foreach (var word in tokens)
    {
        stem.SetCurrent(word);
        stem.Stem();
        var result = stem.Current;
        stemmedTokens.Add(result);
    }
    // Just in case some words have common stems, we apply the Distinct filter again.
    var words = stemmedTokens.Distinct();

    // Get all word ids of the cleaned token list.
    var wordIds = _context.Word.Where(x => words.Contains(x.Word1)).Select(x => x.WordId).ToList();

    // Generate list of DocumentIds based on words and get the top 10.
    // NOTE(review): this pulls the whole DocumentWord table into memory twice
    // (pull and totalcounts) — consider a single materialization.
    var pull = _context.DocumentWord.ToList();
    var svd = pull.Where(x => wordIds.Contains(x.WordId));
    // Total word count per document, used to normalize match counts.
    var totalcounts = _context.DocumentWord.ToList().GroupBy(g => g.DocumentId).ToDictionary(x => x.Key, x => x.Sum(z => z.Count));
    // Score = matched-word count / total word count per document.
    // NOTE(review): if Count is an int this division is integer division and
    // most ratios truncate to 0 — confirm Count's declared type.
    var counts = svd.GroupBy(g => g.DocumentId).Select(y => new { DocumentId = y.Key, Counts = totalcounts[y.Key] != 0 ? y.Sum(z => z.Count) / totalcounts[y.Key] : 0 });
    var top10 = counts.OrderByDescending(c => c.Counts).Take(10);
    var documentIds = top10.Select(x => x.DocumentId).ToList();

    List<int> filteredDocs = _context.Document.Where(x => documentIds.Contains(x.DocumentId)).ToList().Select(x => x.DocumentId).ToList();

    // Subtitle metadata lookup keyed by SubtitleId.
    var subtitles = _context.Subtitle.Select(x => new { x.SubtitleId, x.SubtitleName, x.SubtitleNumber }).ToDictionary(x => x.SubtitleId, x => new { x.SubtitleName, x.SubtitleNumber });

    // Materialize the result DTOs, filling subtitle fields only when present.
    List<DocumentResult> documents = _context.Document.Where(x => filteredDocs.Contains(x.DocumentId)).Include(j => j.Title).Select(y => new DocumentResult
    {
        DocumentText = y.DocumentText,
        SubtitleName = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleName : "",
        SubtitleNumber = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleNumber : "",
        TitleName = y.Title.TitleName,
        TitleNumber = y.Title.TitleNumber,
        Citation = y.UniversalCitation,
        DocumentHeader = y.DocumentHeader
    }).ToList();

    return (documents);
}
/// <summary>
/// 52. Stemming
/// Takes the output of problem 51 as input, applies Porter's stemming
/// algorithm, and prints each word and its stem separated by a tab.
/// (In Python, the "stemming" module can be used as an implementation of
/// Porter's algorithm.)
/// </summary>
public void Answer52()
{
    var stemmer = new PorterStemmer();
    foreach (var sentence in SplitSentence())
    {
        foreach (var word in SplitWords(sentence))
        {
            // Word and its stem, tab separated.
            Console.WriteLine($"{word}\t{stemmer.StemWord(word)}");
        }
        // Blank line between sentences.
        Console.WriteLine();
    }
}
// Returns the set of unique stemmed words in the cleaned text,
// with empty strings excluded.
public HashSet<string> GetNGram(string text)
{
    // Split the cleaned text on spaces, line breaks, tabs and a few
    // special characters.
    var rawWords = Regex.Split(Clean(text), $@"[ \n\t\r$+<>№=]");

    var uniqueValues = new HashSet<string>();
    foreach (var rawWord in rawWords)
    {
        // Strip word endings (stemming) before collecting; skip the
        // empty strings that splitting produces.
        var stemmed = PorterStemmer.TransformingWord(rawWord);
        if (!stemmed.Equals(""))
        {
            uniqueValues.Add(stemmed);
        }
    }
    return uniqueValues;
}
public async Task FindDuplicates(string text)
{
    // Arrange: tokenize the raw text into a document and build the
    // similarity-scoring service around the same preprocessor.
    var stemmer = new PorterStemmer();
    var preprocessor = new TextPreprocessor(stemmer);
    var document = new Document
    {
        Id = 3,
        Tokens = preprocessor.Tokenize(text)
    };
    var scoringService = await BuildService(preprocessor);

    // Act: score the document against the indexed corpus.
    var scores = await scoringService.GetSimilarityScoresAsync(document);

    // Assert: at least one candidate scores above the 0.5 threshold.
    Assert.IsTrue(scores.Any(s => s.Score > 0.5d));
}
// Builds the emotion sensor: loads the ANEW word-metric data file and
// precomputes the (pseudo-inverse) mapping matrices used to convert
// VAD scores into emotion estimates.
public ANEWEmotionSensor(string datadirectory)
{
    // Data files contained in [datadrectory]/metrics
    string metricsdir = datadirectory + Path.DirectorySeparatorChar + "metrics" + Path.DirectorySeparatorChar;

    // Memoized word -> triple-of-distributions source backed by anew.csv.
    // NOTE(review): presumably the three distributions are valence, arousal
    // and dominance per the ANEW dataset — confirm against ANEWFileSource.
    source = new MemoizedSource<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>>(new ANEWFileSource(metricsdir + "anew.csv"));
    stemmer = new PorterStemmer();

    // These matrices are used in G emotion = vad
    // positives are mean > 5; negatives are mean < 5
    // NOTE(review): the coefficient/t-value constants look like published
    // regression results — confirm and cite their source.
    double[,] positives = new double[,] {
        { .890, -.020, -.110, .116, -.035 },
        { .649, .139, -.287, .441, .051 },
        { .601, .153, -.305, .125, .042 } };
    // t-statistics for the positive coefficients; coefficient / t below
    // yields each coefficient's standard error.
    double[,] positiveTs = new double[,] {
        { 45.40, 0.73, 4.24, 4.95, 1.55 },
        { 19.75, 2.984, 6.57, 11.26, 1.36 },
        { 16.60, 2.98, 6.34, 2.88, 1.00 } };
    double[,] positiveSEs = Matrix.ElementwiseDivide(positives, positiveTs);

    double[,] negatives = new double[,] {
        { .291, -.044, -.515, .020, -.243 },
        { .050, .492, -.309, .670, -.042 },
        { .136, .369, -.625, -.144, .041 } };
    double[,] negativeTs = new double[,] {
        { 8.91, 1.27, 13.80, 0.58, 8.27 },
        { 1.36, 12.59, 7.33, 17.11, 1.27 },
        { 2.93, 7.49, 11.75, 2.91, 0.98 } };
    double[,] negativeSEs = Matrix.ElementwiseDivide(negatives, negativeTs);

    // Gaussian distributions centered on each coefficient with sd equal to
    // its standard error (distribution-valued matrices).
    ContinuousDistribution[,] randomPositives = RandomMatrix.MakeGaussians(positives, positiveSEs);
    ContinuousDistribution[,] randomNegatives = RandomMatrix.MakeGaussians(negatives, negativeSEs);

    // Right pseudo-inverse M^T (M M^T)^-1 of each coefficient matrix,
    // in both distribution-valued and plain numeric form.
    positiveProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomPositives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomPositives, RandomMatrix.Transpose(randomPositives))));
    negativeProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomNegatives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomNegatives, RandomMatrix.Transpose(randomNegatives))));
    positiveMatrix = Matrix.Multiply(Matrix.Transpose(positives), Matrix.Inverse(Matrix.Multiply(positives, Matrix.Transpose(positives))));
    negativeMatrix = Matrix.Multiply(Matrix.Transpose(negatives), Matrix.Inverse(Matrix.Multiply(negatives, Matrix.Transpose(negatives))));
}
// Handles the search button: stems the entered word, scans the in-memory
// inverted index for a matching term, and shows the matching files.
private void searchButton_Click(object sender, EventArgs e)
{
    // Stem the query so it matches the stemmed keys stored in the index.
    PorterStemmer stemmer = new PorterStemmer();
    string stemmedWord = stemmer.StemWord(searchWord.Text);

    fileList.Text = " ";
    filesFound.Text = " ";

    // Refuse to search while the index is still being built.
    if (thread.IsAlive)
    {
        MessageBox.Show("The index is currently busy. Please try again later");
        return;
    }

    bool found = false;
    List<string> matches = new List<string>();
    foreach (var item in index.internalIndex)
    {
        string filteredKey = StopWords.RemoveStopwords(item.Key);
        if (filteredKey != stemmedWord)
        {
            continue;
        }

        found = true;
        filesFound.Text = item.Value.Count.ToString();
        foreach (var folderName in item.Value.Keys)
        {
            matches.Add(folderName);
        }
    }

    foreach (var file in matches)
    {
        fileList.Text += file + "\r\n";
    }

    if (!found)
    {
        filesFound.Text = "0";
        fileList.Text = "No results found";
    }
}
// Stems every space-separated token of the input and returns the stems
// re-joined by single spaces.
public static string Process(string textToProcess)
{
    StemmerInterface stemmer = new PorterStemmer();
    var builder = new StringBuilder();

    foreach (string token in textToProcess.Split(' '))
    {
        builder.AppendFormat("{0} ", stemmer.stemTerm(token));
    }

    // Trim the separator appended after the last token.
    return builder.ToString().Trim();
}
// Builds the emotion sensor: loads the ANEW word-metric data file and
// precomputes the (pseudo-inverse) mapping matrices used to convert
// VAD scores into emotion estimates.
public ANEWEmotionSensor(string datadirectory)
{
    // Data files contained in [datadrectory]/metrics
    string metricsdir = datadirectory + Path.DirectorySeparatorChar + "metrics" + Path.DirectorySeparatorChar;

    // Memoized word -> triple-of-distributions source backed by anew.csv.
    // NOTE(review): presumably the three distributions are valence, arousal
    // and dominance per the ANEW dataset — confirm against ANEWFileSource.
    source = new MemoizedSource<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>>(new ANEWFileSource(metricsdir + "anew.csv"));
    stemmer = new PorterStemmer();

    // These matrices are used in G emotion = vad
    // positives are mean > 5; negatives are mean < 5
    // NOTE(review): the coefficient/t-value constants look like published
    // regression results — confirm and cite their source.
    double[,] positives = new double[,] {
        {.890, -.020, -.110, .116, -.035},
        {.649, .139, -.287, .441, .051},
        {.601, .153, -.305, .125, .042}};
    // t-statistics for the positive coefficients; coefficient / t below
    // yields each coefficient's standard error.
    double[,] positiveTs = new double[,] {
        {45.40, 0.73, 4.24, 4.95, 1.55},
        {19.75, 2.984, 6.57, 11.26, 1.36},
        {16.60, 2.98, 6.34, 2.88, 1.00}};
    double[,] positiveSEs = Matrix.ElementwiseDivide(positives, positiveTs);

    double[,] negatives = new double[,] {
        {.291, -.044, -.515, .020, -.243},
        {.050, .492, -.309, .670, -.042},
        {.136, .369, -.625, -.144, .041}};
    double[,] negativeTs = new double[,] {
        {8.91, 1.27, 13.80, 0.58, 8.27},
        {1.36, 12.59, 7.33, 17.11, 1.27},
        {2.93, 7.49, 11.75, 2.91, 0.98}};
    double[,] negativeSEs = Matrix.ElementwiseDivide(negatives, negativeTs);

    // Gaussian distributions centered on each coefficient with sd equal to
    // its standard error (distribution-valued matrices).
    ContinuousDistribution[,] randomPositives = RandomMatrix.MakeGaussians(positives, positiveSEs);
    ContinuousDistribution[,] randomNegatives = RandomMatrix.MakeGaussians(negatives, negativeSEs);

    // Right pseudo-inverse M^T (M M^T)^-1 of each coefficient matrix,
    // in both distribution-valued and plain numeric form.
    positiveProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomPositives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomPositives, RandomMatrix.Transpose(randomPositives))));
    negativeProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomNegatives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomNegatives, RandomMatrix.Transpose(randomNegatives))));
    positiveMatrix = Matrix.Multiply(Matrix.Transpose(positives), Matrix.Inverse(Matrix.Multiply(positives, Matrix.Transpose(positives))));
    negativeMatrix = Matrix.Multiply(Matrix.Transpose(negatives), Matrix.Inverse(Matrix.Multiply(negatives, Matrix.Transpose(negatives))));
}
// Command-line driver: depending on which flag is present ("stem",
// "freqrows", "emotion", "emorows") runs stemming, word-frequency counting,
// or ANEW-based emotion estimation over a data file.
public static void Main(string[] args)
{
    ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());
    PorterStemmer stemmer = new PorterStemmer();

    // --stem WORD: print the word and its Porter stem.
    if (parsedArgs["stem"] != null)
        Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));

    // --freqrows WORD with -f FILE: count occurrences per row of column 1.
    if (parsedArgs["freqrows"] != null)
    {
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
            // CSV output: count1,count2,"row label".
            Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
        }
    }

    // --emotion TEXT: estimate and print each emotion score for the text.
    // NOTE(review): the data directory is a hard-coded developer path —
    // should come from configuration or an argument.
    if (parsedArgs["emotion"] != null)
    {
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
        for (int ii = 0; ii < (int)ANEWEmotionSensor.Emotions.COUNT; ii++)
            Console.WriteLine(((ANEWEmotionSensor.Emotions)ii).ToString() + ": " + emotions[ii]);
    }

    // --emorows with -f FILE: estimate emotions for column 1 of every row,
    // emitting one CSV line per row; counts rows with a valid estimate.
    if (parsedArgs["emorows"] != null)
    {
        int rows = 0, valids = 0;
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            rows++;
            double[] emotions = sensor.EstimateEmotions(row[1]);
            Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
            // NaN in the first component marks a failed estimate.
            if (!double.IsNaN(emotions[0])) valids++;
        }
    }
}
/// <summary>
/// Produces an extractive summary of <paramref name="textFile"/> using
/// Latent Semantic Analysis: builds a concept-by-sentence weight matrix,
/// takes its SVD, and selects one source sentence per concept
/// (the "cross" method).
/// </summary>
/// <param name="textFile">Input text plus summary settings; its
/// DocumentConcepts and DocumentLength properties are populated as a side
/// effect.</param>
/// <returns>The selected sentences concatenated into a single line.</returns>
public static string SummarizeByLSA(TextFile textFile)
{
    string input = textFile.RawText;

    // Split into sentences, strip punctuation, and lower-case each one.
    string[] sentences = input.Split(new char[] { '.', '!', '?', ':', '…', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < sentences.Length; ++i)
    {
        var sb = new StringBuilder();
        string sentence = sentences[i].Trim();
        foreach (char c in sentence)
        {
            if (!char.IsPunctuation(c))
            {
                sb.Append(c);
            }
        }
        sentences[i] = sb.ToString().ToLower();
    }

    // Remove stop words--e.g., the, and, a, etc.
    // (Single pass: the original repeated the same whole-list filter once
    // per stop word, doing identical work N times.)
    string[] stopwords = File.ReadAllLines(@"Resources/stopwords.txt");
    for (int i = 0; i < sentences.Length; ++i)
    {
        sentences[i] = string.Join(" ", sentences[i].Split(' ').Where(wrd => !stopwords.Contains(wrd)));
    }

    // Reduce words to their stem.
    // NOTE(review): StemWord receives the whole sentence, not individual
    // words — confirm it is meant to handle multi-word input.
    PorterStemmer stemmer = new PorterStemmer();
    for (int i = 0; i < sentences.Length; ++i)
    {
        sentences[i] = stemmer.StemWord(sentences[i]);
    }

    // Word frequencies over the whole document.
    Dictionary<string, int> wordFrequencies = new Dictionary<string, int>();
    foreach (string s in sentences)
    {
        foreach (string w in s.Split(' '))
        {
            if (wordFrequencies.ContainsKey(w))
            {
                wordFrequencies[w] += 1;
            }
            else
            {
                wordFrequencies[w] = 1;
            }
        }
    }

    // Top N words with highest frequencies serve as document concepts.
    // (FIX: take directly from the ordered sequence; the original round-
    // tripped through a Dictionary, relying on unspecified enumeration order.)
    int N = textFile.DesiredSummaryLength;
    string[] concepts = (from kvp in wordFrequencies orderby kvp.Value descending select kvp)
        .Take(N)
        .Select(k => k.Key)
        .ToArray();

    // Add concepts to TextFile instance properties.
    textFile.DocumentConcepts = concepts;

    // Build the concept-by-sentence weight matrix X (tf * idf style).
    int documentLength = sentences.Length;
    var X = DenseMatrix.Create(N, documentLength, (i, j) => 0.0);
    for (int i = 0; i < X.RowCount; ++i)
    {
        int sentencesWithConcept = 0;
        string concept = concepts[i];
        for (int j = 0; j < X.ColumnCount; ++j)
        {
            string[] sentenceWords = sentences[j].Split(' ');
            int wordCount = sentenceWords.Count(word => word == concept);
            if (wordCount > 0)
            {
                sentencesWithConcept += 1;
            }
            // BUG FIX: the original used integer division
            // (wordCount / sentenceWords.Length), which truncates to 0 for
            // any sentence longer than its concept count; cast to double.
            X[i, j] = (double)wordCount / sentenceWords.Length;
        }
        if (sentencesWithConcept == 0)
        {
            Console.WriteLine("No sentences with concept " + concepts[i]);
        }
        // Inverse document frequency; the epsilon avoids division by zero.
        double inverseDocumentFreq = Math.Log(documentLength / (sentencesWithConcept + 0.0001), 2.0);
        for (int k = 0; k < X.ColumnCount; ++k)
        {
            X[i, k] = X[i, k] * inverseDocumentFreq;
        }
    }

    // Compute SVD of the topic representation matrix, X.
    var svd = X.Svd();

    // Cross method to select summary sentences: zero out the below-average
    // entries in each concept row of |V^T|, then keep the best column.
    int columnCount = svd.VT.ColumnCount;
    Matrix<double> Vh = svd.VT.SubMatrix(0, concepts.Length, 0, columnCount).PointwiseAbs();
    for (int i = 0; i < Vh.RowCount; ++i)
    {
        double averageSentenceScore = Vh.Row(i).Average();
        for (int j = 0; j < Vh.ColumnCount; ++j)
        {
            if (Vh[i, j] <= averageSentenceScore)
            {
                Vh[i, j] = 0;
            }
        }
    }

    int[] summaryIndices = new int[Vh.RowCount];
    // BUG FIX: the original passed the values as format arguments with no
    // placeholder in the format string, so they were never printed.
    Console.Write("Vh.RowCnt = {0}", Vh.RowCount);
    Console.Write("concepts.Length = {0}", concepts.Length);
    for (int i = 0; i < Vh.RowCount; ++i)
    {
        // Pick the highest-scoring sentence for concept row i.
        double max = 0;
        for (int j = 0; j < Vh.ColumnCount; ++j)
        {
            if (Vh[i, j] > max)
            {
                max = Vh[i, j];
                summaryIndices[i] = j;
            }
        }
    }

    // Map the selected indices back to the original, unmodified sentences.
    string[] sourceSentences = Regex.Split(input, @"(?<=[\.!\?])\s+");
    textFile.DocumentLength = sourceSentences.Length;
    string summary = "";
    foreach (int i in summaryIndices)
    {
        summary += sourceSentences[i] + " ";
    }

    /* From https://bit.ly/3ogjy2l */
    return summary.Replace("\r\n", string.Empty)
        .Replace("\n", string.Empty)
        .Replace("\r", string.Empty)
        .Replace("\t", string.Empty)
        .Replace(((char)0x2028).ToString(), string.Empty)
        .Replace(((char)0x2029).ToString(), string.Empty);
}
/// <summary>
/// The constructor of the <see cref="WebCrawler"/>.
/// </summary>
/// <param name="config">Configuration stored for use by this crawler instance.</param>
public WebCrawler(WebCrawlerConfig config)
{
    stemmer = new PorterStemmer();
    this.config = config;
}
// Constructors and finalizers:
// Private singleton constructor: resolves all data/model paths, then eagerly
// loads the WordNet engine, the OpenNLP models (sentence detector, tokenizer,
// name finder, POS tagger, chunker, optionally the parser), the Stanford
// parser grammar, and the Porter stemmer.
private Repository()
{
    // Short assembly name = everything before the first comma of the full name.
    _assemblyName = Regex.Match(_assemblyFullName, "^(.*?),.*$").Result("$1");

    // Path roots; Replace(@"\", Dsc) normalizes separators for the platform.
    // NOTE(review): the root is a hard-coded cluster path — should be
    // configurable.
    _rootDrive = ("/usr/project/xtmp/dp195/Poetix18/").Replace(@"\", Dsc);
    _nlpFolder = ("rhetorica/nlp/").Replace(@"\", Dsc);

    _openNlpModelsFolder = ("OpenNLP/models/").Replace(@"\", Dsc);
    _openNlpModelsPath = RootDrive + _nlpFolder + _openNlpModelsFolder;

    _wordNetFolder = ("WordNet_3/").Replace(@"\", Dsc);
    _wordNetPath = RootDrive + _nlpFolder + _wordNetFolder;

    _grammarFolder = ("StanfordParser/grammar/").Replace(@"\", Dsc);
    _grammarPath = RootDrive + _nlpFolder + _grammarFolder;

    _dataFolder = ("data/").Replace(@"\", Dsc);
    _nlpTextsPath = RootDrive + _dataFolder;

    string[] localTextDirectoryParts = {
        CurrentAssemblyDirectoryPath,
        "..", "..", "..", "data"
        //"..", "..", "text"
    };
    _localTextPath = Path.Combine(localTextDirectoryParts) + "/"; // For development use

    // WordNet engine:
    Console.Write("Loading WordNet engine.... ");
    _wordNetEngine = new WordNetEngine(WordNetPath, true);
    Console.WriteLine("Done.");

    // OpenNLP sentence detector (Java interop; streams are closed after each
    // model is constructed):
    Console.Write("Loading OpenNLP sentence detector.... ");
    java.io.FileInputStream modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-sent.bin");
    _sentenceModel = new SentenceModel(modelInputStream);
    modelInputStream.close();
    _sentenceDetector = new SentenceDetectorME(_sentenceModel);
    Console.WriteLine("Done.");

    // OpenNLP tokenizer:
    Console.Write("Loading OpenNLP tokenizer.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-token.bin");
    _tokenizerModel = new opennlp.tools.tokenize.TokenizerModel(modelInputStream);
    modelInputStream.close();
    _tokenizer = new opennlp.tools.tokenize.TokenizerME(_tokenizerModel);
    Console.WriteLine("Done.");

    // OpenNLP name finder (person names):
    Console.Write("Loading OpenNLP name finder.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-ner-person.bin");
    _tokenNameFinderModel = new TokenNameFinderModel(modelInputStream);
    modelInputStream.close();
    _nameFinder = new NameFinderME(_tokenNameFinderModel);
    Console.WriteLine("Done.");

    // OpenNLP POS tagger:
    Console.Write("Loading OpenNLP POS tagger.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-pos-maxent.bin");
    _posModel = new POSModel(modelInputStream);
    modelInputStream.close();
    _tagger = new POSTaggerME(_posModel);
    Console.WriteLine("Done.");

    // OpenNLP chunker:
    Console.Write("Loading OpenNLP chunker.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-chunker.bin");
    _chunkerModel = new ChunkerModel(modelInputStream);
    modelInputStream.close();
    _chunker = new ChunkerME(_chunkerModel);
    Console.WriteLine("Done.");

    // OpenNLP parser (optional; only loaded when _loadParser is set):
    if (_loadParser)
    {
        Console.Write("Loading OpenNLP parser.... ");
        modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-parser-chunking.bin");
        _parserModel = new ParserModel(modelInputStream);
        modelInputStream.close();
        _parser = ParserFactory.create(_parserModel);
        Console.WriteLine("Done.");
    }

    // Stanford parser:
    //_stanfordParser = new LexicalizedParser(GrammarPath + "englishPCFG.ser.gz"); // Obsolete method
    _stanfordParser = LexicalizedParser.loadModel(GrammarPath + "englishPCFG.ser.gz");

    // Porter stemmer:
    _porterStemmer = new PorterStemmer();
}
// Scans the current directory for .docx/.pdf/.rtf documents, extracts and
// stems their text, writes one stemmed-token file per document, and writes
// an aggregate stem-frequency dictionary to dictionary.txt.
static void Main(string[] args)
{
    debug = false; // If set increases output of info
    string usage = "Usage options: \"verbose\" for additional output ";
    if (args.Length != 0)
    {
        switch (args[0])
        {
            case "verbose":
                debug = true;
                break;
            default:
                Console.WriteLine(usage);
                break;
        }
    }

    string output;
    string[] tokens;
    StreamWriter outfile = new StreamWriter("dictionary.txt");
    Console.WriteLine("Getting list of files for processing");

    //*************************************
    // Get files from the current directory
    //*************************************
    string path = Directory.GetCurrentDirectory();
    string[] docx = Directory.GetFiles(path, "*.docx");
    string[] pdf = Directory.GetFiles(path, "*.pdf");
    string[] rtf = Directory.GetFiles(path, "*.rtf");
    // FIX: reuse the pdf listing instead of hitting the file system again.
    string[] docArray = docx.Concat(pdf).ToArray();
    docArray = docArray.Concat(rtf).ToArray();
    Array.Sort(docArray); // Ensure sort order is maintained across the processing apps

    // List all docs found when verbose.
    if (debug == true)
    {
        Console.WriteLine("We found the following list of files: ");
        foreach (var file in docArray)
        {
            Console.WriteLine(file);
        }
    }
    Console.WriteLine("Total Files found:{0}", docArray.Length);

    Dictionary<string, int> dictionary = new Dictionary<string, int>();
    foreach (var file in docArray)
    {
        output = processdocument(file);
        if (fileopen == false)
        {
            continue; // extraction failed; skip this document
        }

        string fname = file.Replace(".docx", ".txt").Replace(".pdf", ".txt").Replace(".rtf", ".txt");
        Console.WriteLine($"Writing file {fname} output...");

        // using ensures the per-document file is flushed and closed even for
        // resumes shorter than a page (the original relied on a manual Close).
        using (var writefile = new StreamWriter(fname, true))
        {
            //*************
            // tokenization
            //*************
            char[] separators = { '_', ' ', ',', '.', '-', ':', ';', '{', '}', '|', '\n', '\t', '\u2029', '\r' };
            tokens = output.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            var stemmer = new PorterStemmer();
            foreach (var token in tokens)
            {
                // Stem the current token and record it.
                string stem = stemmer.StemWord(token);
                writefile.WriteLine(stem);

                // BUG FIX: the original used Dictionary.Add inside a
                // try/catch (ArgumentException) as control flow; test for
                // the key instead of throwing on every repeated stem.
                if (dictionary.ContainsKey(stem))
                {
                    dictionary[stem] += 1;
                }
                else
                {
                    dictionary.Add(stem, 1);
                }
            }
        }
    }

    // Now write out the dictionary to a text file.
    if (fileopen == true)
    {
        foreach (var entry in dictionary)
        {
            outfile.WriteLine("{0}, {1}", entry.Key, entry.Value);
        }
        outfile.Close();
    }
    Console.WriteLine("{0} Errors found", errorcount);
}
// Copies this stemmer's algorithm state from another instance, then lets the
// base class copy its own (buffer/cursor) state.
// NOTE(review): B_Y_found and I_p1/I_p2 look like Snowball-generated Porter
// state (the Y-marker flag and the p1/p2 region offsets) — confirm against
// the generated stemmer definition.
private void copy_from(PorterStemmer other)
{
    B_Y_found = other.B_Y_found;
    I_p2 = other.I_p2;
    I_p1 = other.I_p1;
    base.copy_from(other);
}
/// <summary>
/// Gets a stemmer object for the given language code (thread safe:
/// every call constructs a fresh instance).
/// </summary>
/// <param name="language">Language code, e.g. "EN", "FR", "DE".</param>
/// <returns>A new stemmer, or null when the language is not supported.</returns>
public static SnowballProgram GetSnowball(string language)
{
    switch (language)
    {
        case "DA": return new DanishStemmer();          // Danish
        case "NL": return new DutchStemmer();           // Dutch
        case "EN": return new EnglishStemmer();         // English
        case "FI": return new FinnishStemmer();         // Finnish
        case "FR": return new FrenchStemmer();          // French
        case "DE2": return new German2Stemmer();        // German (variant 2)
        case "DE": return new GermanStemmer();          // German
        case "HU": return new HungarianStemmer();
        case "IT": return new ItalianStemmer();
        case "文斯语": return new LovinsStemmer();       // Lovins (English)
        case "NO": return new NorwegianStemmer();
        case "波特语": return new PorterStemmer();        // Porter (English)
        case "PT": return new PortugueseStemmer();      // Portuguese
        case "RO": return new RomanianStemmer();
        case "RU": return new RussianStemmer();         // Russian
        case "ES": return new SpanishStemmer();         // Spanish
        case "SV": return new SwedishStemmer();
        case "TR": return new TurkishStemmer();         // Turkish
        default: return null;
    }
}
static void Main(string[] args) { // start connection with database SqlConnection sqlConnection = new SqlConnection("Data Source=AHMEDFATHY-PC;Initial Catalog=newDB;Integrated Security=True; MultipleActiveResultSets=true"); sqlConnection.Open(); // select statment to retrieve everything from database string queryString = "SELECT * FROM crawler_Table"; SqlCommand cmd = new SqlCommand(queryString, sqlConnection); // declare variable from reader to read from database (all the content from database) SqlDataReader rdr = cmd.ExecuteReader(); int counterofopages = 0; // counter for number of pages that i read it from database (at least 1500) // datastructure to save term and doc_id and frequency and list of positions for this term List <KeyValuePair <string, KeyValuePair <int[], List <int> > > > indexmap = new List <KeyValuePair <string, KeyValuePair <int[], List <int> > > >(); // while loop to read row by row from the reader while (rdr.Read()) { // this condition to break from loop when take at least 1500 page if (counterofopages == 1600) { break; } // try and catch to throw any exceptions out if it retreive null from innertext or something else int boolll = 0; // boolean to check if the inner text has exception change boolean = 1 and skip tha link try { //===================================================// // retreive from each row docid , url (link) , content of the page (html page) int doc_id = (int)rdr["doc_id"]; string url = (string)rdr["URL"]; string content = (string)rdr["Page_Content"]; //===================================================// // pasre html page from database and get the inner text (step 1) IHTMLDocument2 myDoc = new HTMLDocumentClass(); myDoc.write(content); string elements = myDoc.body.innerText; //===================================================// //(it will be) /// split in (step 2) (to take tokens and save it in array of strings named (tokens) string[] tokens = elements.Split(',', ' ', '.', ':', '\t', '\n', '\r'); int i = 0; // counter to 
calculate the position for every term // check if any string it will be null or empty tokens = tokens.Where(x => !string.IsNullOrEmpty(x)).ToArray(); //===================================================// /// saves every term and its list (positions) (s in dictionary named (termsandpos) before removing stop words Dictionary <string, List <int> > termsandpos = new Dictionary <string, List <int> >(); foreach (var words in tokens) { List <int> listofpos = new List <int>(); i++; // using regex to remove punctuation characters from every word (step 3) -> req 1 string word = Regex.Replace(words, @"[^\w\d\s]", ""); word = Regex.Replace(word, @"\d", ""); // if the word is empty after removing punctuation characters continues and don't save it if (word == "") { continue; } // using spelling class from netspell reference and create object from it and using it to check if this word is real word in english or not. Spelling ss = new Spelling(); // when the object from spelling class is used , the dialog window will opened and has many feature and i will closed by using next line to continue my run it's not used for my code. ss.ShowDialog = false; // check if this word is not found in dictionary in the spell library , continue ( go to the next word). // esle continue the rest of the code (that is mean the word is found in the dictionary). 
if (ss.SpellCheck(word)) { continue; } word = word.ToLower(); //case folding in (step 3) -> req 2 //If the word is already existed ,add the new position in the list of this word if (termsandpos.ContainsKey(word)) { listofpos = termsandpos[word]; listofpos.Add(i); termsandpos[word] = listofpos; } // else, add the word and the first position else { listofpos.Add(i); termsandpos.Add(word, listofpos); } } //===================================================// ///// stop words removing in (step 3) -> req 3 /// list of stop words /// create anthor dictinary to copy all terms without stop words Dictionary <string, List <int> > temp = new Dictionary <string, List <int> >(); List <string> stopwords = new List <string>() { "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", 
"itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the" }; for (int f = 0; f < termsandpos.Count; f++) { // if the term is already existed in the stopwords list or the term is a single character like ii or i , continue (and go to the next term). 
if (stopwords.Contains(termsandpos.Keys.ElementAt(f)) || termsandpos.Keys.ElementAt(f).Length <= 2) { continue; } // else ,that's mean the term is not a stop word then add it and its positions in the temp dictionary. else { List <int> copyofpositions = new List <int>(); copyofpositions = termsandpos[termsandpos.Keys.ElementAt(f)]; temp.Add(termsandpos.Keys.ElementAt(f), copyofpositions); } } //===================================================// //// al stemming algorithm (step 3) --> req 4 var stemmer = new PorterStemmer(); // declare object from claas of porterstemmer algorithm Dictionary <string, List <int> > finalterm = new Dictionary <string, List <int> >(); foreach (KeyValuePair <string, List <int> > iter1 in temp) { //===================================================// // add every term and its docid in table called (TermsBStemming_Table) in db before stemming (the note in step 3 -->req 4) string insertString3 = "INSERT INTO TermsBStemming_Table (termBstemming,docID) VALUES (@termBstemming,@docID)"; SqlCommand cmd3 = new SqlCommand(insertString3, sqlConnection); SqlParameter par1 = new SqlParameter("@termBstemming", iter1.Key); SqlParameter par2 = new SqlParameter("@docID", doc_id); cmd3.Parameters.Add(par1); cmd3.Parameters.Add(par2); cmd3.ExecuteNonQuery(); //===================================================// List <int> listofpositions = new List <int>(); // called function (StemWord) and send the term and return term after stemming string stem = stemmer.StemWord(iter1.Key); // check if this stem is already existed in finalterm dictionary (the new datastructure to save the term and its list after stemmnig) if (finalterm.ContainsKey(stem)) { List <int> tempforsimlir = new List <int>(); tempforsimlir = finalterm[stem]; // take the list of positions for this term (old positions added before for this term) listofpositions = temp[iter1.Key]; // take the list of new positions for this term /// added the new positions and old position in one list for (int j 
= 0; j < listofpositions.Count; j++) { tempforsimlir.Add(listofpositions[j]); } // and save it again for the term finalterm[stem] = tempforsimlir; } // addd the term ans its list to finalterm dictionary else { listofpositions = temp[iter1.Key]; finalterm.Add(stem, listofpositions); } } //===================================================// //// inverted index (step 4) foreach (KeyValuePair <string, List <int> > iter in finalterm) { int freq = iter.Value.Count; // calculate freq through count number of positions int[] arr = new int[2]; // save in this array doc id and the frequency arr[0] = doc_id; arr[1] = freq; // convert list of the positions for every term to string var resultofpositions = string.Join(", ", iter.Value); //===================================================// // save term and docid ans=d frequency and (list of positions as string ) in table called Inverted_Index in db. string insertString2 = "INSERT INTO Inverted_Index (Term,DocID,Frequency,position) VALUES (@Term,@DocID,@Frequency,@position)"; SqlCommand cmd2 = new SqlCommand(insertString2, sqlConnection); SqlParameter paramter1 = new SqlParameter("@Term", iter.Key); SqlParameter paramter2 = new SqlParameter("@DocID", doc_id); SqlParameter paramter3 = new SqlParameter("@Frequency", freq); SqlParameter paramter4 = new SqlParameter("@position", resultofpositions); cmd2.Parameters.Add(paramter1); cmd2.Parameters.Add(paramter2); cmd2.Parameters.Add(paramter3); cmd2.Parameters.Add(paramter4); cmd2.ExecuteNonQuery(); //===================================================// /// store in index list term and arrof ints (arr[0]=docid,arr[1] = freqs of every term) and list of all positions of this term (if i needed in ranks or something else). 
indexmap.Add(new KeyValuePair <string, KeyValuePair <int[], List <int> > >(iter.Key, new KeyValuePair <int[], List <int> >(arr, iter.Value))); } //===================================================// } //===================================================// //catch any type of exception and change the boolean that i decalred equal zero catch (NullReferenceException ex) { boolll = 1; Console.WriteLine(ex.Message); } catch (ArgumentOutOfRangeException exx) { boolll = 1; Console.WriteLine(exx.Message); } // if the boolean became equal 1 , then leave this link and go to anthor link if (boolll == 1) { continue; } //===================================================// /// to count number of pages (at least 1500 page) counterofopages++; //===================================================// } //===================================================// // close the reader from database rdr.Close(); /// close the connection sqlConnection.Close(); //===================================================// }
public static void Main(string[] args)
{
    // Command-line driver. Dispatches on the flags parsed by ToolArguments:
    //   -stem <word>     : print the Porter stem of a single word
    //   -freqrows <word> : per-row word-frequency counts over the -f CSV
    //   -emotion <text>  : print ANEW emotion estimates for one text
    //   -emorows         : emotion estimates for every row of the -f CSV
    //   -eimpute         : impute emotional content from the -f CSV, then
    //                      write per-row emotion columns to "<f>result"
    ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());
    PorterStemmer stemmer = new PorterStemmer();

    if (parsedArgs["stem"] != null)
        Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));

    if (parsedArgs["freqrows"] != null) {
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
            // Count occurrences of the -freqrows word in column 1; emit CSV.
            TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
            Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
        }
    }

    if (parsedArgs["emotion"] != null) {
        // NOTE(review): data directory is hard-coded; consider promoting it to a flag.
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
        for (int ii = 0; ii < (int) ANEWEmotionSensor.Emotions.COUNT; ii++)
            Console.WriteLine(((ANEWEmotionSensor.Emotions) ii).ToString() + ": " + emotions[ii]);
    }

    if (parsedArgs["emorows"] != null) {
        // rows/valids tally total rows and rows with usable (non-NaN) estimates.
        // NOTE(review): the tallies are accumulated but never reported anywhere.
        int rows = 0, valids = 0;
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
            rows++;
            double[] emotions = sensor.EstimateEmotions(row[1]);
            Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
            if (!double.IsNaN(emotions[0]))
                valids++;
        }
    }

    if (parsedArgs["eimpute"] != null) {
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");

        // smallFile selects an in-memory imputation path; it is hard-coded
        // off, so the streaming ImputeEmotionalContentFromFile path runs.
        bool smallFile = false;
        if (smallFile) {
            DataReader reader = new DataReader(parsedArgs["f"]);
            List<List<string>> rows = new List<List<string>>();
            for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                // NOTE(review): WriteLine(row) prints the array's type name,
                // not its contents — presumably meant as a progress trace.
                Console.WriteLine(row);
                rows.Add(TwitterUtilities.SplitWords(row[10].ToLower()));
            }
            reader.Close();
            sensor.ImputeEmotionalContent(rows, 10, parsedArgs["f"] + "imputed");
        } else {
            sensor.ImputeEmotionalContentFromFile(parsedArgs["f"], 11, 0, parsedArgs["f"].Substring(0, parsedArgs["f"].Length - 4) + "imputed.csv");
        }

        // Re-read the input and append the eight emotion estimates to each row.
        uint jj = 0;
        using (var stream = File.CreateText(parsedArgs["f"] + "result")) {
            DataReader reader = new DataReader(parsedArgs["f"]);
            for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                // Fix: the progress counter previously sat outside this loop,
                // so it incremented exactly once and never printed; count per
                // row so "#<n>" appears every 1000 rows as intended.
                jj++;
                if (jj % 1000 == 0)
                    Console.WriteLine("#" + jj);
                double[] emotions = sensor.EstimateEmotions(row[11]);
                for (int ii = 0; ii < 11; ii++)
                    stream.Write(row[ii] + ",");
                stream.WriteLine(emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7]);
            }
            // Close the reader, matching the smallFile branch above.
            reader.Close();
        }
    }
}
public void InvokingTheConstructor_WithNoArguments_ShouldReturnNewInstance()
{
    // Arrange & Act: the parameterless constructor should always succeed.
    PorterStemmer stemmer = new PorterStemmer();

    // Assert: the constructed object is a PorterStemmer instance.
    Assert.IsInstanceOfType(stemmer, typeof(PorterStemmer));
}