public void Stem(string word, string wordExpected)
{
    // Arrange: a fresh stemmer per case keeps tests independent.
    var stemmer = new PorterStemmer();

    // Act: stem the raw input word.
    var actual = stemmer.Stem(word);

    // Assert: the produced stem matches the expected value.
    Assert.AreEqual(wordExpected, actual);
}
// Tokenizes and stems the free-text query, then delegates to the
// token-list overload of Execute.
public LinkedList<IndexEntry> Execute(string query, CrawlerRegistry registry, int maxResults = 25, bool usePageRank = false)
{
    var stemmer = new PorterStemmer();
    var stemmedTokens = new HashSet<string>();

    foreach (var token in query.ToLower().Split(' '))
    {
        // Drop single characters and stop words before stemming.
        if (token.Length <= 1 || StopWords.StopWordsList.Contains(token))
        {
            continue;
        }
        stemmedTokens.Add(stemmer.StemWord(token.ToLower()));
    }

    return this.Execute(stemmedTokens.ToList(), registry, maxResults, usePageRank);
}
public void TestFromPorter()
{
    // English stemmer; each dictionary entry maps a raw word to its
    // expected stem.
    var stemmer = new PorterStemmer("en");

    foreach (var entry in dic)
    {
        var stemmed = stemmer.Stem(entry.Key);
        Assert.AreEqual(entry.Value, stemmed);
    }
}
// Tokenizes the raw query, upper-cases tokens listed in
// StopWords.BooleanWords (operator keywords), stems everything else,
// and delegates to the token-list overload of ParseQuery.
public QueryPart ParseQuery(string input)
{
    var stemmer = new PorterStemmer();
    var stemmedTokens = new List<string>();

    foreach (var token in input.ToLower().Split(' '))
    {
        // Drop single characters and boolean-query stop words.
        if (token.Length <= 1 || StopWords.BooleanStopWordsList.Contains(token))
        {
            continue;
        }

        stemmedTokens.Add(StopWords.BooleanWords.Contains(token.ToLower())
            ? token.ToUpper()
            : stemmer.StemWord(token.ToLower()));
    }

    return this.ParseQuery(stemmedTokens);
}
// Builds an inverted index over all files under <paramref name="folder"/>:
// maps each stemmed term to a dictionary of (file path -> occurrence count).
// Also updates the instance fields internalIndex and indexCount.
public Dictionary<string, Dictionary<string, double>> InvertedIndex(string folder)
{
    converter = new Converter();

    // Clear any existing index before rebuilding so its memory is reclaimed.
    if (internalIndex != null)
    {
        internalIndex.Clear();
    }

    internalIndex = new Dictionary<string, Dictionary<string, double>>(); // the inverted index to be returned
    indexCount = 0; // number of distinct terms in the index

    // One stemmer instance is reused for every word of every file.
    PorterStemmer stemmer = new PorterStemmer();

    foreach (string file in IndexingFolders(folder))
    {
        foreach (string word in ScanFiles.scanFiles(file))
        {
            string stemmedWord = stemmer.StemWord(word);

            Dictionary<string, double> fileList;
            if (internalIndex.TryGetValue(stemmedWord, out fileList))
            {
                // Term already indexed: bump this file's count.
                // BUG FIX: the original round-tripped the stored double
                // through a string (double.Parse(value.ToString()) + 1);
                // the value is already a double, so just increment it.
                // fileList is the dictionary stored in internalIndex, so no
                // write-back is needed.
                if (fileList.ContainsKey(file))
                {
                    fileList[file] = fileList[file] + 1;
                }
                else
                {
                    fileList.Add(file, 1.0);
                }
            }
            else
            {
                // First sighting of this term: start a new file list.
                internalIndex.Add(stemmedWord, new Dictionary<string, double> { { file, 1.0 } });
                indexCount++;
            }
        }
    }

    return internalIndex;
}
public void Test_StemWordOutPut_Matches_StaticOutput()
{
    // Walk three directories up from the test binary toward the project root,
    // where the StemmerTestFiles fixtures live.
    string filepath = Path.GetDirectoryName(System.AppDomain.CurrentDomain.BaseDirectory);
    filepath = Directory.GetParent(Directory.GetParent(Directory.GetParent(filepath).FullName).FullName).FullName;

    List<string> StaticOutput = new List<string>();
    List<string> TestOutput = new List<string>();

    // Load the expected (pre-stemmed) words, one per line.
    try
    {
        string staticOutputPath = filepath + @"\StemmerTestFiles\OutputWords.txt";
        using (var stream = new StreamReader(staticOutputPath))
        {
            for (string line = stream.ReadLine(); line != null; line = stream.ReadLine())
            {
                StaticOutput.Add(line);
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("The file could not be read:");
        Console.WriteLine(e.Message);
    }

    // Stem each raw word and collect the results in file order.
    PorterStemmer ps = new PorterStemmer();
    try
    {
        string staticOutputPath = filepath + @"\StemmerTestFiles\RawWords.txt";
        using (var stream = new StreamReader(staticOutputPath))
        {
            for (string line = stream.ReadLine(); line != null; line = stream.ReadLine())
            {
                TestOutput.Add(ps.StemWord(line));
            }
        }
    }
    catch (IOException e)
    {
        Console.WriteLine("The file could not be read:");
        Console.WriteLine(e.Message);
    }

    // Sequences must match element-for-element, in order.
    Assert.IsTrue(StaticOutput.SequenceEqual(TestOutput));
}
/// <summary>
/// Transforms the list of terms into their root form.
/// Uses the Porter Stemming algorithm.
/// </summary>
/// <param name="wordList">List of terms to edit</param>
/// <returns>A new list containing the stem of each input term, in order.</returns>
public static List<string> StemWords(List<string> wordList)
{
    var stemmer = new PorterStemmer();
    var stemmedList = new List<string>(wordList.Count);
    foreach (var term in wordList)
    {
        stemmedList.Add(stemmer.StemWord(term));
    }
    return stemmedList;
}
// Full-text search entry point: cleans and stems the query, scores documents
// by the fraction of their indexed words that match, and returns the top-10
// matching documents with title/subtitle metadata attached.
public static List<DocumentResult> Search(string input)
{
    // Take the input, split up into words while discarding symbols and numbers,
    // then remove the stop words and set all cases to lowercase.
    var tokens = input.Split(" ", StringSplitOptions.RemoveEmptyEntries)
        .Select(x => (x.Where(c => char.IsLetter(c)).Aggregate("", (current, c) => current + c)).ToLower());
    tokens = tokens.Where(x => !BLL.Constants.stopwords.Contains(x)).Distinct();

    var stemmedTokens = new List<string>();
    // Instantiate the stemmer.
    PorterStemmer stem = new PorterStemmer();
    // Stem all the words in the input and add to the list.
    foreach (var word in tokens)
    {
        stem.SetCurrent(word);
        stem.Stem();
        var result = stem.Current;
        stemmedTokens.Add(result);
    }
    // Just in case some words have common stems, we apply the Distinct filter again.
    var words = stemmedTokens.Distinct();

    // Get all word ids of the cleaned token list.
    var wordIds = _context.Word.Where(x => words.Contains(x.Word1)).Select(x => x.WordId).ToList();

    // Generate list of DocumentIds based on words and get the top 10.
    // NOTE(review): this pulls the whole DocumentWord table into memory twice
    // (pull and totalcounts) — consider a single materialization.
    var pull = _context.DocumentWord.ToList();
    var svd = pull.Where(x => wordIds.Contains(x.WordId));
    // Total word count per document, used to normalize match counts.
    var totalcounts = _context.DocumentWord.ToList().GroupBy(g => g.DocumentId).ToDictionary(x => x.Key, x => x.Sum(z => z.Count));
    // Score = matched-word count / total word count per document.
    // NOTE(review): if Count is an int this division is integer division and
    // most ratios truncate to 0 — confirm Count's declared type.
    var counts = svd.GroupBy(g => g.DocumentId).Select(y => new { DocumentId = y.Key, Counts = totalcounts[y.Key] != 0 ? y.Sum(z => z.Count) / totalcounts[y.Key] : 0 });
    var top10 = counts.OrderByDescending(c => c.Counts).Take(10);
    var documentIds = top10.Select(x => x.DocumentId).ToList();

    List<int> filteredDocs = _context.Document.Where(x => documentIds.Contains(x.DocumentId)).ToList().Select(x => x.DocumentId).ToList();

    // Subtitle metadata lookup keyed by SubtitleId.
    var subtitles = _context.Subtitle.Select(x => new { x.SubtitleId, x.SubtitleName, x.SubtitleNumber }).ToDictionary(x => x.SubtitleId, x => new { x.SubtitleName, x.SubtitleNumber });

    // Materialize the result DTOs, filling subtitle fields only when present.
    List<DocumentResult> documents = _context.Document.Where(x => filteredDocs.Contains(x.DocumentId)).Include(j => j.Title).Select(y => new DocumentResult
    {
        DocumentText = y.DocumentText,
        SubtitleName = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleName : "",
        SubtitleNumber = y.SubtitleId.HasValue ? subtitles[y.SubtitleId.Value].SubtitleNumber : "",
        TitleName = y.Title.TitleName,
        TitleNumber = y.Title.TitleNumber,
        Citation = y.UniversalCitation,
        DocumentHeader = y.DocumentHeader
    }).ToList();

    return (documents);
}
/// <summary>
/// 52. Stemming
/// Takes the output of problem 51 as input, applies Porter's stemming
/// algorithm, and prints each word and its stem separated by a tab.
/// (In Python, the "stemming" module can be used as an implementation of
/// Porter's algorithm.)
/// </summary>
public void Answer52()
{
    var stemmer = new PorterStemmer();
    foreach (var sentence in SplitSentence())
    {
        foreach (var word in SplitWords(sentence))
        {
            // Word and its stem, tab separated.
            Console.WriteLine($"{word}\t{stemmer.StemWord(word)}");
        }
        // Blank line between sentences.
        Console.WriteLine();
    }
}
// Returns the set of unique stemmed words in the cleaned text,
// with empty strings excluded.
public HashSet<string> GetNGram(string text)
{
    // Split the cleaned text on spaces, line breaks, tabs and a few
    // special characters.
    var rawWords = Regex.Split(Clean(text), $@"[ \n\t\r$+<>№=]");

    var uniqueValues = new HashSet<string>();
    foreach (var rawWord in rawWords)
    {
        // Strip word endings (stemming) before collecting; skip the
        // empty strings that splitting produces.
        var stemmed = PorterStemmer.TransformingWord(rawWord);
        if (!stemmed.Equals(""))
        {
            uniqueValues.Add(stemmed);
        }
    }
    return uniqueValues;
}
public async Task FindDuplicates(string text)
{
    // Arrange: tokenize the raw text into a document and build the
    // similarity-scoring service around the same preprocessor.
    var stemmer = new PorterStemmer();
    var preprocessor = new TextPreprocessor(stemmer);
    var document = new Document
    {
        Id = 3,
        Tokens = preprocessor.Tokenize(text)
    };
    var scoringService = await BuildService(preprocessor);

    // Act: score the document against the indexed corpus.
    var scores = await scoringService.GetSimilarityScoresAsync(document);

    // Assert: at least one candidate scores above the 0.5 threshold.
    Assert.IsTrue(scores.Any(s => s.Score > 0.5d));
}
// Builds the emotion sensor: loads the ANEW word-metric data file and
// precomputes the (pseudo-inverse) mapping matrices used to convert
// VAD scores into emotion estimates.
public ANEWEmotionSensor(string datadirectory)
{
    // Data files contained in [datadrectory]/metrics
    string metricsdir = datadirectory + Path.DirectorySeparatorChar + "metrics" + Path.DirectorySeparatorChar;

    // Memoized word -> triple-of-distributions source backed by anew.csv.
    // NOTE(review): presumably the three distributions are valence, arousal
    // and dominance per the ANEW dataset — confirm against ANEWFileSource.
    source = new MemoizedSource<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>>(new ANEWFileSource(metricsdir + "anew.csv"));
    stemmer = new PorterStemmer();

    // These matrices are used in G emotion = vad
    // positives are mean > 5; negatives are mean < 5
    // NOTE(review): the coefficient/t-value constants look like published
    // regression results — confirm and cite their source.
    double[,] positives = new double[,] {
        { .890, -.020, -.110, .116, -.035 },
        { .649, .139, -.287, .441, .051 },
        { .601, .153, -.305, .125, .042 } };
    // t-statistics for the positive coefficients; coefficient / t below
    // yields each coefficient's standard error.
    double[,] positiveTs = new double[,] {
        { 45.40, 0.73, 4.24, 4.95, 1.55 },
        { 19.75, 2.984, 6.57, 11.26, 1.36 },
        { 16.60, 2.98, 6.34, 2.88, 1.00 } };
    double[,] positiveSEs = Matrix.ElementwiseDivide(positives, positiveTs);

    double[,] negatives = new double[,] {
        { .291, -.044, -.515, .020, -.243 },
        { .050, .492, -.309, .670, -.042 },
        { .136, .369, -.625, -.144, .041 } };
    double[,] negativeTs = new double[,] {
        { 8.91, 1.27, 13.80, 0.58, 8.27 },
        { 1.36, 12.59, 7.33, 17.11, 1.27 },
        { 2.93, 7.49, 11.75, 2.91, 0.98 } };
    double[,] negativeSEs = Matrix.ElementwiseDivide(negatives, negativeTs);

    // Gaussian distributions centered on each coefficient with sd equal to
    // its standard error (distribution-valued matrices).
    ContinuousDistribution[,] randomPositives = RandomMatrix.MakeGaussians(positives, positiveSEs);
    ContinuousDistribution[,] randomNegatives = RandomMatrix.MakeGaussians(negatives, negativeSEs);

    // Right pseudo-inverse M^T (M M^T)^-1 of each coefficient matrix,
    // in both distribution-valued and plain numeric form.
    positiveProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomPositives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomPositives, RandomMatrix.Transpose(randomPositives))));
    negativeProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomNegatives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomNegatives, RandomMatrix.Transpose(randomNegatives))));
    positiveMatrix = Matrix.Multiply(Matrix.Transpose(positives), Matrix.Inverse(Matrix.Multiply(positives, Matrix.Transpose(positives))));
    negativeMatrix = Matrix.Multiply(Matrix.Transpose(negatives), Matrix.Inverse(Matrix.Multiply(negatives, Matrix.Transpose(negatives))));
}
// Handles the search button: stems the entered word, scans the in-memory
// inverted index for a matching term, and shows the matching files.
private void searchButton_Click(object sender, EventArgs e)
{
    // Stem the query so it matches the stemmed keys stored in the index.
    PorterStemmer stemmer = new PorterStemmer();
    string stemmedWord = stemmer.StemWord(searchWord.Text);

    fileList.Text = " ";
    filesFound.Text = " ";

    // Refuse to search while the index is still being built.
    if (thread.IsAlive)
    {
        MessageBox.Show("The index is currently busy. Please try again later");
        return;
    }

    bool found = false;
    List<string> matches = new List<string>();
    foreach (var item in index.internalIndex)
    {
        string filteredKey = StopWords.RemoveStopwords(item.Key);
        if (filteredKey != stemmedWord)
        {
            continue;
        }

        found = true;
        filesFound.Text = item.Value.Count.ToString();
        foreach (var folderName in item.Value.Keys)
        {
            matches.Add(folderName);
        }
    }

    foreach (var file in matches)
    {
        fileList.Text += file + "\r\n";
    }

    if (!found)
    {
        filesFound.Text = "0";
        fileList.Text = "No results found";
    }
}
// Stems every space-separated token of the input and returns the stems
// re-joined by single spaces.
public static string Process(string textToProcess)
{
    StemmerInterface stemmer = new PorterStemmer();
    var builder = new StringBuilder();

    foreach (string token in textToProcess.Split(' '))
    {
        builder.AppendFormat("{0} ", stemmer.stemTerm(token));
    }

    // Trim the separator appended after the last token.
    return builder.ToString().Trim();
}
// Builds the emotion sensor: loads the ANEW word-metric data file and
// precomputes the (pseudo-inverse) mapping matrices used to convert
// VAD scores into emotion estimates.
public ANEWEmotionSensor(string datadirectory)
{
    // Data files contained in [datadrectory]/metrics
    string metricsdir = datadirectory + Path.DirectorySeparatorChar + "metrics" + Path.DirectorySeparatorChar;

    // Memoized word -> triple-of-distributions source backed by anew.csv.
    // NOTE(review): presumably the three distributions are valence, arousal
    // and dominance per the ANEW dataset — confirm against ANEWFileSource.
    source = new MemoizedSource<string, ThreeTuple<ContinuousDistribution, ContinuousDistribution, ContinuousDistribution>>(new ANEWFileSource(metricsdir + "anew.csv"));
    stemmer = new PorterStemmer();

    // These matrices are used in G emotion = vad
    // positives are mean > 5; negatives are mean < 5
    // NOTE(review): the coefficient/t-value constants look like published
    // regression results — confirm and cite their source.
    double[,] positives = new double[,] {
        {.890, -.020, -.110, .116, -.035},
        {.649, .139, -.287, .441, .051},
        {.601, .153, -.305, .125, .042}};
    // t-statistics for the positive coefficients; coefficient / t below
    // yields each coefficient's standard error.
    double[,] positiveTs = new double[,] {
        {45.40, 0.73, 4.24, 4.95, 1.55},
        {19.75, 2.984, 6.57, 11.26, 1.36},
        {16.60, 2.98, 6.34, 2.88, 1.00}};
    double[,] positiveSEs = Matrix.ElementwiseDivide(positives, positiveTs);

    double[,] negatives = new double[,] {
        {.291, -.044, -.515, .020, -.243},
        {.050, .492, -.309, .670, -.042},
        {.136, .369, -.625, -.144, .041}};
    double[,] negativeTs = new double[,] {
        {8.91, 1.27, 13.80, 0.58, 8.27},
        {1.36, 12.59, 7.33, 17.11, 1.27},
        {2.93, 7.49, 11.75, 2.91, 0.98}};
    double[,] negativeSEs = Matrix.ElementwiseDivide(negatives, negativeTs);

    // Gaussian distributions centered on each coefficient with sd equal to
    // its standard error (distribution-valued matrices).
    ContinuousDistribution[,] randomPositives = RandomMatrix.MakeGaussians(positives, positiveSEs);
    ContinuousDistribution[,] randomNegatives = RandomMatrix.MakeGaussians(negatives, negativeSEs);

    // Right pseudo-inverse M^T (M M^T)^-1 of each coefficient matrix,
    // in both distribution-valued and plain numeric form.
    positiveProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomPositives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomPositives, RandomMatrix.Transpose(randomPositives))));
    negativeProduct = RandomMatrix.Multiply(RandomMatrix.Transpose(randomNegatives), RandomMatrix.Inverse(RandomMatrix.Multiply(randomNegatives, RandomMatrix.Transpose(randomNegatives))));
    positiveMatrix = Matrix.Multiply(Matrix.Transpose(positives), Matrix.Inverse(Matrix.Multiply(positives, Matrix.Transpose(positives))));
    negativeMatrix = Matrix.Multiply(Matrix.Transpose(negatives), Matrix.Inverse(Matrix.Multiply(negatives, Matrix.Transpose(negatives))));
}
// Command-line driver: depending on which flag is present ("stem",
// "freqrows", "emotion", "emorows") runs stemming, word-frequency counting,
// or ANEW-based emotion estimation over a data file.
public static void Main(string[] args)
{
    ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());
    PorterStemmer stemmer = new PorterStemmer();

    // --stem WORD: print the word and its Porter stem.
    if (parsedArgs["stem"] != null)
        Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));

    // --freqrows WORD with -f FILE: count occurrences per row of column 1.
    if (parsedArgs["freqrows"] != null)
    {
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
            // CSV output: count1,count2,"row label".
            Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
        }
    }

    // --emotion TEXT: estimate and print each emotion score for the text.
    // NOTE(review): the data directory is a hard-coded developer path —
    // should come from configuration or an argument.
    if (parsedArgs["emotion"] != null)
    {
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
        for (int ii = 0; ii < (int)ANEWEmotionSensor.Emotions.COUNT; ii++)
            Console.WriteLine(((ANEWEmotionSensor.Emotions)ii).ToString() + ": " + emotions[ii]);
    }

    // --emorows with -f FILE: estimate emotions for column 1 of every row,
    // emitting one CSV line per row; counts rows with a valid estimate.
    if (parsedArgs["emorows"] != null)
    {
        int rows = 0, valids = 0;
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow())
        {
            rows++;
            double[] emotions = sensor.EstimateEmotions(row[1]);
            Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
            // NaN in the first component marks a failed estimate.
            if (!double.IsNaN(emotions[0])) valids++;
        }
    }
}
/// <summary>
/// Produces an extractive summary of <paramref name="textFile"/> using
/// Latent Semantic Analysis: builds a concept-by-sentence weight matrix,
/// takes its SVD, and selects one source sentence per concept
/// (the "cross" method).
/// </summary>
/// <param name="textFile">Input text plus summary settings; its
/// DocumentConcepts and DocumentLength properties are populated as a side
/// effect.</param>
/// <returns>The selected sentences concatenated into a single line.</returns>
public static string SummarizeByLSA(TextFile textFile)
{
    string input = textFile.RawText;

    // Split into sentences, strip punctuation, and lower-case each one.
    string[] sentences = input.Split(new char[] { '.', '!', '?', ':', '…', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < sentences.Length; ++i)
    {
        var sb = new StringBuilder();
        string sentence = sentences[i].Trim();
        foreach (char c in sentence)
        {
            if (!char.IsPunctuation(c))
            {
                sb.Append(c);
            }
        }
        sentences[i] = sb.ToString().ToLower();
    }

    // Remove stop words--e.g., the, and, a, etc.
    // (Single pass: the original repeated the same whole-list filter once
    // per stop word, doing identical work N times.)
    string[] stopwords = File.ReadAllLines(@"Resources/stopwords.txt");
    for (int i = 0; i < sentences.Length; ++i)
    {
        sentences[i] = string.Join(" ", sentences[i].Split(' ').Where(wrd => !stopwords.Contains(wrd)));
    }

    // Reduce words to their stem.
    // NOTE(review): StemWord receives the whole sentence, not individual
    // words — confirm it is meant to handle multi-word input.
    PorterStemmer stemmer = new PorterStemmer();
    for (int i = 0; i < sentences.Length; ++i)
    {
        sentences[i] = stemmer.StemWord(sentences[i]);
    }

    // Word frequencies over the whole document.
    Dictionary<string, int> wordFrequencies = new Dictionary<string, int>();
    foreach (string s in sentences)
    {
        foreach (string w in s.Split(' '))
        {
            if (wordFrequencies.ContainsKey(w))
            {
                wordFrequencies[w] += 1;
            }
            else
            {
                wordFrequencies[w] = 1;
            }
        }
    }

    // Top N words with highest frequencies serve as document concepts.
    // (FIX: take directly from the ordered sequence; the original round-
    // tripped through a Dictionary, relying on unspecified enumeration order.)
    int N = textFile.DesiredSummaryLength;
    string[] concepts = (from kvp in wordFrequencies orderby kvp.Value descending select kvp)
        .Take(N)
        .Select(k => k.Key)
        .ToArray();

    // Add concepts to TextFile instance properties.
    textFile.DocumentConcepts = concepts;

    // Build the concept-by-sentence weight matrix X (tf * idf style).
    int documentLength = sentences.Length;
    var X = DenseMatrix.Create(N, documentLength, (i, j) => 0.0);
    for (int i = 0; i < X.RowCount; ++i)
    {
        int sentencesWithConcept = 0;
        string concept = concepts[i];
        for (int j = 0; j < X.ColumnCount; ++j)
        {
            string[] sentenceWords = sentences[j].Split(' ');
            int wordCount = sentenceWords.Count(word => word == concept);
            if (wordCount > 0)
            {
                sentencesWithConcept += 1;
            }
            // BUG FIX: the original used integer division
            // (wordCount / sentenceWords.Length), which truncates to 0 for
            // any sentence longer than its concept count; cast to double.
            X[i, j] = (double)wordCount / sentenceWords.Length;
        }
        if (sentencesWithConcept == 0)
        {
            Console.WriteLine("No sentences with concept " + concepts[i]);
        }
        // Inverse document frequency; the epsilon avoids division by zero.
        double inverseDocumentFreq = Math.Log(documentLength / (sentencesWithConcept + 0.0001), 2.0);
        for (int k = 0; k < X.ColumnCount; ++k)
        {
            X[i, k] = X[i, k] * inverseDocumentFreq;
        }
    }

    // Compute SVD of the topic representation matrix, X.
    var svd = X.Svd();

    // Cross method to select summary sentences: zero out the below-average
    // entries in each concept row of |V^T|, then keep the best column.
    int columnCount = svd.VT.ColumnCount;
    Matrix<double> Vh = svd.VT.SubMatrix(0, concepts.Length, 0, columnCount).PointwiseAbs();
    for (int i = 0; i < Vh.RowCount; ++i)
    {
        double averageSentenceScore = Vh.Row(i).Average();
        for (int j = 0; j < Vh.ColumnCount; ++j)
        {
            if (Vh[i, j] <= averageSentenceScore)
            {
                Vh[i, j] = 0;
            }
        }
    }

    int[] summaryIndices = new int[Vh.RowCount];
    // BUG FIX: the original passed the values as format arguments with no
    // placeholder in the format string, so they were never printed.
    Console.Write("Vh.RowCnt = {0}", Vh.RowCount);
    Console.Write("concepts.Length = {0}", concepts.Length);
    for (int i = 0; i < Vh.RowCount; ++i)
    {
        // Pick the highest-scoring sentence for concept row i.
        double max = 0;
        for (int j = 0; j < Vh.ColumnCount; ++j)
        {
            if (Vh[i, j] > max)
            {
                max = Vh[i, j];
                summaryIndices[i] = j;
            }
        }
    }

    // Map the selected indices back to the original, unmodified sentences.
    string[] sourceSentences = Regex.Split(input, @"(?<=[\.!\?])\s+");
    textFile.DocumentLength = sourceSentences.Length;
    string summary = "";
    foreach (int i in summaryIndices)
    {
        summary += sourceSentences[i] + " ";
    }

    /* From https://bit.ly/3ogjy2l */
    return summary.Replace("\r\n", string.Empty)
        .Replace("\n", string.Empty)
        .Replace("\r", string.Empty)
        .Replace("\t", string.Empty)
        .Replace(((char)0x2028).ToString(), string.Empty)
        .Replace(((char)0x2029).ToString(), string.Empty);
}
/// <summary>
/// The constructor of the <see cref="WebCrawler"/>.
/// </summary>
/// <param name="config">Configuration stored for use by this crawler instance.</param>
public WebCrawler(WebCrawlerConfig config)
{
    stemmer = new PorterStemmer();
    this.config = config;
}
// Constructors and finalizers:
// Private singleton constructor: resolves all data/model paths, then eagerly
// loads the WordNet engine, the OpenNLP models (sentence detector, tokenizer,
// name finder, POS tagger, chunker, optionally the parser), the Stanford
// parser grammar, and the Porter stemmer.
private Repository()
{
    // Short assembly name = everything before the first comma of the full name.
    _assemblyName = Regex.Match(_assemblyFullName, "^(.*?),.*$").Result("$1");

    // Path roots; Replace(@"\", Dsc) normalizes separators for the platform.
    // NOTE(review): the root is a hard-coded cluster path — should be
    // configurable.
    _rootDrive = ("/usr/project/xtmp/dp195/Poetix18/").Replace(@"\", Dsc);
    _nlpFolder = ("rhetorica/nlp/").Replace(@"\", Dsc);

    _openNlpModelsFolder = ("OpenNLP/models/").Replace(@"\", Dsc);
    _openNlpModelsPath = RootDrive + _nlpFolder + _openNlpModelsFolder;

    _wordNetFolder = ("WordNet_3/").Replace(@"\", Dsc);
    _wordNetPath = RootDrive + _nlpFolder + _wordNetFolder;

    _grammarFolder = ("StanfordParser/grammar/").Replace(@"\", Dsc);
    _grammarPath = RootDrive + _nlpFolder + _grammarFolder;

    _dataFolder = ("data/").Replace(@"\", Dsc);
    _nlpTextsPath = RootDrive + _dataFolder;

    string[] localTextDirectoryParts = {
        CurrentAssemblyDirectoryPath,
        "..", "..", "..", "data"
        //"..", "..", "text"
    };
    _localTextPath = Path.Combine(localTextDirectoryParts) + "/"; // For development use

    // WordNet engine:
    Console.Write("Loading WordNet engine.... ");
    _wordNetEngine = new WordNetEngine(WordNetPath, true);
    Console.WriteLine("Done.");

    // OpenNLP sentence detector (Java interop; streams are closed after each
    // model is constructed):
    Console.Write("Loading OpenNLP sentence detector.... ");
    java.io.FileInputStream modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-sent.bin");
    _sentenceModel = new SentenceModel(modelInputStream);
    modelInputStream.close();
    _sentenceDetector = new SentenceDetectorME(_sentenceModel);
    Console.WriteLine("Done.");

    // OpenNLP tokenizer:
    Console.Write("Loading OpenNLP tokenizer.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-token.bin");
    _tokenizerModel = new opennlp.tools.tokenize.TokenizerModel(modelInputStream);
    modelInputStream.close();
    _tokenizer = new opennlp.tools.tokenize.TokenizerME(_tokenizerModel);
    Console.WriteLine("Done.");

    // OpenNLP name finder (person names):
    Console.Write("Loading OpenNLP name finder.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-ner-person.bin");
    _tokenNameFinderModel = new TokenNameFinderModel(modelInputStream);
    modelInputStream.close();
    _nameFinder = new NameFinderME(_tokenNameFinderModel);
    Console.WriteLine("Done.");

    // OpenNLP POS tagger:
    Console.Write("Loading OpenNLP POS tagger.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-pos-maxent.bin");
    _posModel = new POSModel(modelInputStream);
    modelInputStream.close();
    _tagger = new POSTaggerME(_posModel);
    Console.WriteLine("Done.");

    // OpenNLP chunker:
    Console.Write("Loading OpenNLP chunker.... ");
    modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-chunker.bin");
    _chunkerModel = new ChunkerModel(modelInputStream);
    modelInputStream.close();
    _chunker = new ChunkerME(_chunkerModel);
    Console.WriteLine("Done.");

    // OpenNLP parser (optional; only loaded when _loadParser is set):
    if (_loadParser)
    {
        Console.Write("Loading OpenNLP parser.... ");
        modelInputStream = new java.io.FileInputStream(OpenNlpModelsPath + "en-parser-chunking.bin");
        _parserModel = new ParserModel(modelInputStream);
        modelInputStream.close();
        _parser = ParserFactory.create(_parserModel);
        Console.WriteLine("Done.");
    }

    // Stanford parser:
    //_stanfordParser = new LexicalizedParser(GrammarPath + "englishPCFG.ser.gz"); // Obsolete method
    _stanfordParser = LexicalizedParser.loadModel(GrammarPath + "englishPCFG.ser.gz");

    // Porter stemmer:
    _porterStemmer = new PorterStemmer();
}
// Scans the current directory for .docx/.pdf/.rtf documents, extracts and
// stems their text, writes one stemmed-token file per document, and writes
// an aggregate stem-frequency dictionary to dictionary.txt.
static void Main(string[] args)
{
    debug = false; // If set increases output of info
    string usage = "Usage options: \"verbose\" for additional output ";
    if (args.Length != 0)
    {
        switch (args[0])
        {
            case "verbose":
                debug = true;
                break;
            default:
                Console.WriteLine(usage);
                break;
        }
    }

    string output;
    string[] tokens;
    StreamWriter outfile = new StreamWriter("dictionary.txt");
    Console.WriteLine("Getting list of files for processing");

    //*************************************
    // Get files from the current directory
    //*************************************
    string path = Directory.GetCurrentDirectory();
    string[] docx = Directory.GetFiles(path, "*.docx");
    string[] pdf = Directory.GetFiles(path, "*.pdf");
    string[] rtf = Directory.GetFiles(path, "*.rtf");
    // FIX: reuse the pdf listing instead of hitting the file system again.
    string[] docArray = docx.Concat(pdf).ToArray();
    docArray = docArray.Concat(rtf).ToArray();
    Array.Sort(docArray); // Ensure sort order is maintained across the processing apps

    // List all docs found when verbose.
    if (debug == true)
    {
        Console.WriteLine("We found the following list of files: ");
        foreach (var file in docArray)
        {
            Console.WriteLine(file);
        }
    }
    Console.WriteLine("Total Files found:{0}", docArray.Length);

    Dictionary<string, int> dictionary = new Dictionary<string, int>();
    foreach (var file in docArray)
    {
        output = processdocument(file);
        if (fileopen == false)
        {
            continue; // extraction failed; skip this document
        }

        string fname = file.Replace(".docx", ".txt").Replace(".pdf", ".txt").Replace(".rtf", ".txt");
        Console.WriteLine($"Writing file {fname} output...");

        // using ensures the per-document file is flushed and closed even for
        // resumes shorter than a page (the original relied on a manual Close).
        using (var writefile = new StreamWriter(fname, true))
        {
            //*************
            // tokenization
            //*************
            char[] separators = { '_', ' ', ',', '.', '-', ':', ';', '{', '}', '|', '\n', '\t', '\u2029', '\r' };
            tokens = output.Split(separators, StringSplitOptions.RemoveEmptyEntries);

            var stemmer = new PorterStemmer();
            foreach (var token in tokens)
            {
                // Stem the current token and record it.
                string stem = stemmer.StemWord(token);
                writefile.WriteLine(stem);

                // BUG FIX: the original used Dictionary.Add inside a
                // try/catch (ArgumentException) as control flow; test for
                // the key instead of throwing on every repeated stem.
                if (dictionary.ContainsKey(stem))
                {
                    dictionary[stem] += 1;
                }
                else
                {
                    dictionary.Add(stem, 1);
                }
            }
        }
    }

    // Now write out the dictionary to a text file.
    if (fileopen == true)
    {
        foreach (var entry in dictionary)
        {
            outfile.WriteLine("{0}, {1}", entry.Key, entry.Value);
        }
        outfile.Close();
    }
    Console.WriteLine("{0} Errors found", errorcount);
}
// Copies this stemmer's algorithm state from another instance, then lets the
// base class copy its own (buffer/cursor) state.
// NOTE(review): B_Y_found and I_p1/I_p2 look like Snowball-generated Porter
// state (the Y-marker flag and the p1/p2 region offsets) — confirm against
// the generated stemmer definition.
private void copy_from(PorterStemmer other)
{
    B_Y_found = other.B_Y_found;
    I_p2 = other.I_p2;
    I_p1 = other.I_p1;
    base.copy_from(other);
}
/// <summary>
/// Gets a stemmer object for the given language code (thread safe:
/// every call constructs a fresh instance).
/// </summary>
/// <param name="language">Language code, e.g. "EN", "FR", "DE".</param>
/// <returns>A new stemmer, or null when the language is not supported.</returns>
public static SnowballProgram GetSnowball(string language)
{
    switch (language)
    {
        case "DA": return new DanishStemmer();          // Danish
        case "NL": return new DutchStemmer();           // Dutch
        case "EN": return new EnglishStemmer();         // English
        case "FI": return new FinnishStemmer();         // Finnish
        case "FR": return new FrenchStemmer();          // French
        case "DE2": return new German2Stemmer();        // German (variant 2)
        case "DE": return new GermanStemmer();          // German
        case "HU": return new HungarianStemmer();
        case "IT": return new ItalianStemmer();
        case "文斯语": return new LovinsStemmer();       // Lovins (English)
        case "NO": return new NorwegianStemmer();
        case "波特语": return new PorterStemmer();        // Porter (English)
        case "PT": return new PortugueseStemmer();      // Portuguese
        case "RO": return new RomanianStemmer();
        case "RU": return new RussianStemmer();         // Russian
        case "ES": return new SpanishStemmer();         // Spanish
        case "SV": return new SwedishStemmer();
        case "TR": return new TurkishStemmer();         // Turkish
        default: return null;
    }
}
static void Main(string[] args) { // start connection with database SqlConnection sqlConnection = new SqlConnection("Data Source=AHMEDFATHY-PC;Initial Catalog=newDB;Integrated Security=True; MultipleActiveResultSets=true"); sqlConnection.Open(); // select statment to retrieve everything from database string queryString = "SELECT * FROM crawler_Table"; SqlCommand cmd = new SqlCommand(queryString, sqlConnection); // declare variable from reader to read from database (all the content from database) SqlDataReader rdr = cmd.ExecuteReader(); int counterofopages = 0; // counter for number of pages that i read it from database (at least 1500) // datastructure to save term and doc_id and frequency and list of positions for this term List <KeyValuePair <string, KeyValuePair <int[], List <int> > > > indexmap = new List <KeyValuePair <string, KeyValuePair <int[], List <int> > > >(); // while loop to read row by row from the reader while (rdr.Read()) { // this condition to break from loop when take at least 1500 page if (counterofopages == 1600) { break; } // try and catch to throw any exceptions out if it retreive null from innertext or something else int boolll = 0; // boolean to check if the inner text has exception change boolean = 1 and skip tha link try { //===================================================// // retreive from each row docid , url (link) , content of the page (html page) int doc_id = (int)rdr["doc_id"]; string url = (string)rdr["URL"]; string content = (string)rdr["Page_Content"]; //===================================================// // pasre html page from database and get the inner text (step 1) IHTMLDocument2 myDoc = new HTMLDocumentClass(); myDoc.write(content); string elements = myDoc.body.innerText; //===================================================// //(it will be) /// split in (step 2) (to take tokens and save it in array of strings named (tokens) string[] tokens = elements.Split(',', ' ', '.', ':', '\t', '\n', '\r'); int i = 0; // counter to 
calculate the position for every term // check if any string it will be null or empty tokens = tokens.Where(x => !string.IsNullOrEmpty(x)).ToArray(); //===================================================// /// saves every term and its list (positions) (s in dictionary named (termsandpos) before removing stop words Dictionary <string, List <int> > termsandpos = new Dictionary <string, List <int> >(); foreach (var words in tokens) { List <int> listofpos = new List <int>(); i++; // using regex to remove punctuation characters from every word (step 3) -> req 1 string word = Regex.Replace(words, @"[^\w\d\s]", ""); word = Regex.Replace(word, @"\d", ""); // if the word is empty after removing punctuation characters continues and don't save it if (word == "") { continue; } // using spelling class from netspell reference and create object from it and using it to check if this word is real word in english or not. Spelling ss = new Spelling(); // when the object from spelling class is used , the dialog window will opened and has many feature and i will closed by using next line to continue my run it's not used for my code. ss.ShowDialog = false; // check if this word is not found in dictionary in the spell library , continue ( go to the next word). // esle continue the rest of the code (that is mean the word is found in the dictionary). 
if (ss.SpellCheck(word)) { continue; } word = word.ToLower(); //case folding in (step 3) -> req 2 //If the word is already existed ,add the new position in the list of this word if (termsandpos.ContainsKey(word)) { listofpos = termsandpos[word]; listofpos.Add(i); termsandpos[word] = listofpos; } // else, add the word and the first position else { listofpos.Add(i); termsandpos.Add(word, listofpos); } } //===================================================// ///// stop words removing in (step 3) -> req 3 /// list of stop words /// create anthor dictinary to copy all terms without stop words Dictionary <string, List <int> > temp = new Dictionary <string, List <int> >(); List <string> stopwords = new List <string>() { "a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", 
"itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thick", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "the" }; for (int f = 0; f < termsandpos.Count; f++) { // if the term is already existed in the stopwords list or the term is a single character like ii or i , continue (and go to the next term). 
if (stopwords.Contains(termsandpos.Keys.ElementAt(f)) || termsandpos.Keys.ElementAt(f).Length <= 2) { continue; } // else ,that's mean the term is not a stop word then add it and its positions in the temp dictionary. else { List <int> copyofpositions = new List <int>(); copyofpositions = termsandpos[termsandpos.Keys.ElementAt(f)]; temp.Add(termsandpos.Keys.ElementAt(f), copyofpositions); } } //===================================================// //// al stemming algorithm (step 3) --> req 4 var stemmer = new PorterStemmer(); // declare object from claas of porterstemmer algorithm Dictionary <string, List <int> > finalterm = new Dictionary <string, List <int> >(); foreach (KeyValuePair <string, List <int> > iter1 in temp) { //===================================================// // add every term and its docid in table called (TermsBStemming_Table) in db before stemming (the note in step 3 -->req 4) string insertString3 = "INSERT INTO TermsBStemming_Table (termBstemming,docID) VALUES (@termBstemming,@docID)"; SqlCommand cmd3 = new SqlCommand(insertString3, sqlConnection); SqlParameter par1 = new SqlParameter("@termBstemming", iter1.Key); SqlParameter par2 = new SqlParameter("@docID", doc_id); cmd3.Parameters.Add(par1); cmd3.Parameters.Add(par2); cmd3.ExecuteNonQuery(); //===================================================// List <int> listofpositions = new List <int>(); // called function (StemWord) and send the term and return term after stemming string stem = stemmer.StemWord(iter1.Key); // check if this stem is already existed in finalterm dictionary (the new datastructure to save the term and its list after stemmnig) if (finalterm.ContainsKey(stem)) { List <int> tempforsimlir = new List <int>(); tempforsimlir = finalterm[stem]; // take the list of positions for this term (old positions added before for this term) listofpositions = temp[iter1.Key]; // take the list of new positions for this term /// added the new positions and old position in one list for (int j 
= 0; j < listofpositions.Count; j++) { tempforsimlir.Add(listofpositions[j]); } // and save it again for the term finalterm[stem] = tempforsimlir; } // addd the term ans its list to finalterm dictionary else { listofpositions = temp[iter1.Key]; finalterm.Add(stem, listofpositions); } } //===================================================// //// inverted index (step 4) foreach (KeyValuePair <string, List <int> > iter in finalterm) { int freq = iter.Value.Count; // calculate freq through count number of positions int[] arr = new int[2]; // save in this array doc id and the frequency arr[0] = doc_id; arr[1] = freq; // convert list of the positions for every term to string var resultofpositions = string.Join(", ", iter.Value); //===================================================// // save term and docid ans=d frequency and (list of positions as string ) in table called Inverted_Index in db. string insertString2 = "INSERT INTO Inverted_Index (Term,DocID,Frequency,position) VALUES (@Term,@DocID,@Frequency,@position)"; SqlCommand cmd2 = new SqlCommand(insertString2, sqlConnection); SqlParameter paramter1 = new SqlParameter("@Term", iter.Key); SqlParameter paramter2 = new SqlParameter("@DocID", doc_id); SqlParameter paramter3 = new SqlParameter("@Frequency", freq); SqlParameter paramter4 = new SqlParameter("@position", resultofpositions); cmd2.Parameters.Add(paramter1); cmd2.Parameters.Add(paramter2); cmd2.Parameters.Add(paramter3); cmd2.Parameters.Add(paramter4); cmd2.ExecuteNonQuery(); //===================================================// /// store in index list term and arrof ints (arr[0]=docid,arr[1] = freqs of every term) and list of all positions of this term (if i needed in ranks or something else). 
indexmap.Add(new KeyValuePair <string, KeyValuePair <int[], List <int> > >(iter.Key, new KeyValuePair <int[], List <int> >(arr, iter.Value))); } //===================================================// } //===================================================// //catch any type of exception and change the boolean that i decalred equal zero catch (NullReferenceException ex) { boolll = 1; Console.WriteLine(ex.Message); } catch (ArgumentOutOfRangeException exx) { boolll = 1; Console.WriteLine(exx.Message); } // if the boolean became equal 1 , then leave this link and go to anthor link if (boolll == 1) { continue; } //===================================================// /// to count number of pages (at least 1500 page) counterofopages++; //===================================================// } //===================================================// // close the reader from database rdr.Close(); /// close the connection sqlConnection.Close(); //===================================================// }
public static void Main(string[] args)
{
    // Command-line driver. Dispatches on the flags parsed by ToolArguments:
    //   -stem <word>     : print the Porter stem of a single word
    //   -freqrows <word> : per-row word-frequency counts over the -f CSV
    //   -emotion <text>  : print ANEW emotion estimates for one text
    //   -emorows         : emotion estimates for every row of the -f CSV
    //   -eimpute         : impute emotional content from the -f CSV, then
    //                      write per-row emotion columns to "<f>result"
    ToolArguments parsedArgs = new ToolArguments(args, "None", new MainClass());
    PorterStemmer stemmer = new PorterStemmer();

    if (parsedArgs["stem"] != null)
        Console.WriteLine(parsedArgs["stem"] + " => " + stemmer.stemTerm(parsedArgs["stem"]));

    if (parsedArgs["freqrows"] != null) {
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
            // Count occurrences of the -freqrows word in column 1; emit CSV.
            TwoTuple<int, int> counts = FrequencyTools.WordCount(parsedArgs["freqrows"], row[1]);
            Console.WriteLine(counts.one + "," + counts.two + ",\"" + row[2] + "\"");
        }
    }

    if (parsedArgs["emotion"] != null) {
        // NOTE(review): data directory is hard-coded; consider promoting it to a flag.
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        double[] emotions = sensor.EstimateEmotions(parsedArgs["emotion"]);
        for (int ii = 0; ii < (int) ANEWEmotionSensor.Emotions.COUNT; ii++)
            Console.WriteLine(((ANEWEmotionSensor.Emotions) ii).ToString() + ": " + emotions[ii]);
    }

    if (parsedArgs["emorows"] != null) {
        // rows/valids tally total rows and rows with usable (non-NaN) estimates.
        // NOTE(review): the tallies are accumulated but never reported anywhere.
        int rows = 0, valids = 0;
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");
        DataReader reader = new DataReader(parsedArgs["f"]);
        for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
            rows++;
            double[] emotions = sensor.EstimateEmotions(row[1]);
            Console.WriteLine("\"" + row[0] + "\"," + emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7] + ",\"" + row[2] + "\"");
            if (!double.IsNaN(emotions[0]))
                valids++;
        }
    }

    if (parsedArgs["eimpute"] != null) {
        ANEWEmotionSensor sensor = new ANEWEmotionSensor("/Users/jrising/projects/virsona/github/data");

        // smallFile selects an in-memory imputation path; it is hard-coded
        // off, so the streaming ImputeEmotionalContentFromFile path runs.
        bool smallFile = false;
        if (smallFile) {
            DataReader reader = new DataReader(parsedArgs["f"]);
            List<List<string>> rows = new List<List<string>>();
            for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                // NOTE(review): WriteLine(row) prints the array's type name,
                // not its contents — presumably meant as a progress trace.
                Console.WriteLine(row);
                rows.Add(TwitterUtilities.SplitWords(row[10].ToLower()));
            }
            reader.Close();
            sensor.ImputeEmotionalContent(rows, 10, parsedArgs["f"] + "imputed");
        } else {
            sensor.ImputeEmotionalContentFromFile(parsedArgs["f"], 11, 0, parsedArgs["f"].Substring(0, parsedArgs["f"].Length - 4) + "imputed.csv");
        }

        // Re-read the input and append the eight emotion estimates to each row.
        uint jj = 0;
        using (var stream = File.CreateText(parsedArgs["f"] + "result")) {
            DataReader reader = new DataReader(parsedArgs["f"]);
            for (string[] row = reader.ReadRow(); row != null; row = reader.ReadRow()) {
                // Fix: the progress counter previously sat outside this loop,
                // so it incremented exactly once and never printed; count per
                // row so "#<n>" appears every 1000 rows as intended.
                jj++;
                if (jj % 1000 == 0)
                    Console.WriteLine("#" + jj);
                double[] emotions = sensor.EstimateEmotions(row[11]);
                for (int ii = 0; ii < 11; ii++)
                    stream.Write(row[ii] + ",");
                stream.WriteLine(emotions[0] + "," + emotions[1] + "," + emotions[2] + "," + emotions[3] + "," + emotions[4] + "," + emotions[5] + "," + emotions[6] + "," + emotions[7]);
            }
            // Close the reader, matching the smallFile branch above.
            reader.Close();
        }
    }
}
public void InvokingTheConstructor_WithNoArguments_ShouldReturnNewInstance()
{
    // Arrange & Act: the parameterless constructor should always succeed.
    PorterStemmer stemmer = new PorterStemmer();

    // Assert: the constructed object is a PorterStemmer instance.
    Assert.IsInstanceOfType(stemmer, typeof(PorterStemmer));
}