// Copies the per-stemmer state (the Y-found flag and the p1/p2 region
// markers) from another EnglishStemmer, then lets the base class copy the
// shared Snowball program state (current word buffer, cursor, limits).
private void copy_from(EnglishStemmer other)
{
    B_Y_found = other.B_Y_found;
    I_p2 = other.I_p2;
    I_p1 = other.I_p1;
    // BUG FIX: the original called copy_from(other) again — unconditional
    // self-recursion that overflows the stack on first use. Snowball-generated
    // stemmers delegate to the base copy here (Java: super.copy_from(other)).
    base.copy_from(other);
}
// Returns the Term entities from the database that match the (stemmed,
// distinct) words of SearchPhrase. Punctuation is stripped and the phrase is
// lower-cased first; SearchPhrase is overwritten with the cleaned text as a
// side effect (preserved from the original implementation).
public List<Term> termsInQuery()
{
    // FIX: build the cleaned phrase without O(n^2) string concatenation.
    string documentWithoutPunctuation =
        new string(SearchPhrase.Where(c => !char.IsPunctuation(c)).ToArray());
    SearchPhrase = documentWithoutPunctuation;

    // Lower-case, split on spaces, and stem every token.
    List<string> termStrings = SearchPhrase.ToLower().Split(' ').ToList();
    List<string> stemmedTerms = new List<string>();
    EnglishStemmer stemmer = new EnglishStemmer();
    foreach (var ts in termStrings)
    {
        stemmedTerms.Add(stemmer.Stem(ts));
    }

    // Look each distinct stem up in the Terms table; stems with no matching
    // dictionary entry are simply skipped.
    List<Term> terms = new List<Term>();
    foreach (var stemmedterm in stemmedTerms.Distinct())
    {
        // FIX: one FirstOrDefault query instead of Any + First, which issued
        // two database round trips per term.
        var match = db.Terms.FirstOrDefault(i => i.StemmedText == stemmedterm);
        if (match != null)
        {
            terms.Add(match);
        }
    }
    return terms;
}
// Lets the user pick a folder, indexes every file in it into a Lucene index
// under .\LuceneIndex (title stored un-indexed; text stemmed, tokenized, with
// term vectors), then re-opens the index and dumps every term to the console.
private void btnFolder_Click(object sender, EventArgs e)
{
    FolderBrowserDialog dia = new FolderBrowserDialog();
    DialogResult res = dia.ShowDialog();
    if (res != System.Windows.Forms.DialogResult.OK)
    {
        return;
    }

    FSDirectory dir = FSDirectory.GetDirectory(Environment.CurrentDirectory + "\\LuceneIndex");
    Lucene.Net.Analysis.Standard.StandardAnalyzer an = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
    // true => create the index from scratch, discarding any previous contents.
    IndexWriter wr = new IndexWriter(dir, an, true);
    IStemmer stemmer = new EnglishStemmer();

    DirectoryInfo diMain = new DirectoryInfo(dia.SelectedPath);
    foreach (FileInfo fi in diMain.GetFiles())
    {
        Document doc = new Document();
        doc.Add(new Field("title", fi.Name, Field.Store.YES, Field.Index.NO));
        doc.Add(new Field("text",
            PerformStemming(stemmer, NLPToolkit.Tokenizer.TokenizeNow(File.ReadAllText(fi.FullName)).ToArray()),
            Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
        wr.AddDocument(doc);
    }
    wr.Optimize();
    wr.Flush();
    wr.Close();

    // BUG FIX: the original called dir.Close() *before* IndexReader.Open(dir),
    // opening the reader against an already-closed directory, and never closed
    // the reader. Open the reader first; close reader and directory afterwards.
    IndexReader reader = IndexReader.Open(dir);
    try
    {
        for (int i = 0; i < reader.MaxDoc(); i++)
        {
            if (reader.IsDeleted(i)) continue;
            Document doc = reader.Document(i);
            String docId = doc.Get("docId");
            foreach (TermFreqVector vector in reader.GetTermFreqVectors(i))
            {
                foreach (string term in vector.GetTerms())
                {
                    Console.WriteLine(term);
                }
            }
            // do something with docId here...
            // NOTE(review): no "docId" field is ever added above, so docId is
            // always null here — confirm whether "title"/"id" was intended.
        }
    }
    finally
    {
        reader.Close();
        dir.Close();
    }
}
// Indexes all tweets from the Mongo collection into a Lucene index on disk,
// one day-window at a time, storing each tweet's metadata plus its stemmed
// text and the classifier's positive/negative sentiment scores.
public static void CreateIndex(MongoCollection<TweetItem> collection)
{
    // One-day window [dtmFirst, dtmLast] that slides forward from 2014-05-17
    // until it reaches today's date (inclusive).
    DateTime dtmFirst = new DateTime(2014, 05, 17, 0, 0, 0);
    DateTime dtmLast = new DateTime(2014, 05, 17, 23, 59, 59);

    // Index lives under .\LuceneIndex; third IndexWriter arg = true re-creates
    // the index from scratch each run.
    FSDirectory dir = FSDirectory.GetDirectory(Environment.CurrentDirectory + "\\LuceneIndex");
    //Lucene.Net.Store.RAMDirectory dir = new RAMDirectory();
    Lucene.Net.Analysis.StopAnalyzer an = new Lucene.Net.Analysis.StopAnalyzer();
    IndexWriter wr = new IndexWriter(dir, an, true);
    IStemmer stemmer = new EnglishStemmer();

    while (dtmFirst.Date <= DateTime.Now.Date)
    {
        // All tweets whose CreationDate falls inside the current day window.
        var query = Query<TweetItem>.Where(t => t.CreationDate >= dtmFirst && t.CreationDate <= dtmLast);
        List<TweetItem> value = collection.Find(query).ToList();
        //DirectoryInfo diMain = new DirectoryInfo(dia.SelectedPath);

        // NOTE(review): this HttpClient is configured for the text2sentiment
        // API but never used below (the GetSentiment call is commented out),
        // so a client is allocated and disposed per day for nothing — confirm
        // whether the block can be removed or the API call restored.
        using (var client = new HttpClient())
        {
            client.BaseAddress = new Uri("http://www.datasciencetoolkit.org/text2sentiment");
            client.DefaultRequestHeaders.Accept.Clear();
            client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));

            foreach (TweetItem tweet in value)
            {
                Document doc = new Document();
                //SentimentResult res = await GetSentiment(tweet.Text, client);

                // Tokenize + stem the tweet text, then score the stemmed text
                // with the (static) classifier, skipping the exclude list.
                string stemmedtext = PerformStemming(stemmer, NLPToolkit.Tokenizer.TokenizeNow(tweet.Text).ToArray());
                var scores = classifier.Classify(stemmedtext, DragonHelper.DragonHelper.ExcludeList);
                string positiveSentiment = string.Empty;
                string negativeSentiment = string.Empty;
                positiveSentiment = scores["Positive"].ToString();
                negativeSentiment = scores["Negative"].ToString();

                // id/created/user are stored but not indexed; text and both
                // sentiment fields are tokenized with term vectors.
                doc.Add(new Field("id", tweet._id.ToString(), Field.Store.YES, Field.Index.NO));
                doc.Add(new Field("created", tweet.CreationDate.ToString(), Field.Store.YES, Field.Index.NO));
                doc.Add(new Field("user", tweet.User, Field.Store.YES, Field.Index.NO));
                doc.Add(new Field("text", stemmedtext, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
                doc.Add(new Field("possentiment", positiveSentiment, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
                doc.Add(new Field("negsentiment", negativeSentiment, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
                wr.AddDocument(doc);
            }
        }

        // Advance the window to the next day.
        dtmFirst = dtmFirst.AddDays(1);
        dtmLast = dtmLast.AddDays(1);
    }

    wr.Optimize();
    wr.Flush();
    wr.Close();
    dir.Close();
}
// Stems every whitespace-separated word of opText and returns the stems
// joined by single spaces, with a trailing space (matching the original
// concatenation output exactly).
private string StemWords(string opText)
{
    // FIX: removed the dead currentPostStems dictionary and the commented-out
    // TestStemmer experiment — neither was used.
    string[] opArr = opText.Split(null); // null => split on Unicode whitespace
    EnglishStemmer stemmer = new EnglishStemmer();

    // FIX: collect stems into an array and string.Join them instead of
    // O(n^2) string concatenation in the loop.
    string[] stems = new string[opArr.Length];
    for (int i = 0; i < opArr.Length; i++)
    {
        stems[i] = stemmer.Stem(opArr[i]);
    }
    return string.Join(" ", stems) + " ";
}
// Initializes the static tagging state: stop-word tables, the English and
// Russian stemmers, the sentence list, and the tag list.
public static void InitParams(List<string> t)
{
    Eng = new StWdsEng();
    Rus = new StWdsRus();
    EngStem = new EnglishStemmer();
    RusStem = new RussianStemmer();
    // FIX: the original allocated `new List<string>()` and immediately
    // overwrote the reference with t — a dead allocation. Note that ts
    // deliberately aliases the caller's list (behavior unchanged).
    ts = t;
    Tag = new List<List<Word>>();
}
// Initializes the static tagging state from an array of strings: stop-word
// tables, the English and Russian stemmers, the sentence list (copied from
// the array), and the tag list.
public static void InitParams(string[] t)
{
    Eng = new StWdsEng();
    Rus = new StWdsRus();
    EngStem = new EnglishStemmer();
    RusStem = new RussianStemmer();
    ts = new List<string>();
    // FIX: dropped the pointless local alias `string[] temp = t` — AddRange
    // can take the parameter directly.
    ts.AddRange(t);
    Tag = new List<List<Word>>();
}
// Initializes the static tagging state for the two-language variant:
// stop-word tables, stemmers, the sentence list, and paired (index 0/1)
// word-collection and tag structures.
public static void InitParams(List<string> t)
{
    Eng = new StWdsEng();
    Rus = new StWdsRus();
    EngStem = new EnglishStemmer();
    RusStem = new RussianStemmer();
    // FIX: removed the dead `new List<string>()` allocation that was
    // immediately overwritten; ts intentionally aliases the caller's list.
    ts = t;
    fullWdsCollection = new List<List<string>>[2];
    fullWdsCollection[0] = new List<List<string>>();
    fullWdsCollection[1] = new List<List<string>>();
    Tag = new List<List<Word>>[2];
    Tag[0] = new List<List<Word>>();
    Tag[1] = new List<List<Word>>();
}
// Constructs the wrapper, instantiating the underlying EnglishStemmer that
// performs the actual stemming work for this adapter.
public ExternalEnglishStemmer() { stemmer = new EnglishStemmer(); }
// Processes one document for indexing: strips punctuation, tokenizes and
// stems its content, registers previously-unseen non-stopword terms in
// db.Terms (and in allTerms, the in-memory cache of the dictionary), then
// records a TermDocumentWeight with the raw term frequency for every known
// term that occurs in this document.
// NOTE(review): totalNoOfDocumentsInCollection is currently unused here —
// presumably reserved for IDF weighting; confirm with callers.
private void processDocument(Document document, int totalNoOfDocumentsInCollection, List<Term> stopwords, List<Term> allTerms)
{
    // Remove all punctuation.
    // FIX: collect kept chars and build the string once instead of O(n^2)
    // string concatenation.
    List<char> keptChars = new List<char>();
    foreach (var c in document.Content)
    {
        if (!char.IsPunctuation(c))
        {
            keptChars.Add(c);
        }
    }
    string documentWithoutPunctuation = new string(keptChars.ToArray());

    // Tokenize: lower-case, split on spaces and CRLF, drop empty entries.
    string[] splitStrings = { " ", "\r\n" };
    List<string> rawTermsInDocument = documentWithoutPunctuation.ToLower()
        .Split(splitStrings, StringSplitOptions.RemoveEmptyEntries).ToList();

    // Stem each term; add unseen non-stopword stems to the dictionary.
    EnglishStemmer stemmer = new EnglishStemmer();
    List<string> stemmedTermsInDocument = new List<string>();
    foreach (var rawTerm in rawTermsInDocument)
    {
        string stemmedTerm = stemmer.Stem(rawTerm);
        stemmedTermsInDocument.Add(stemmedTerm);
        Term termObj = new Term { StemmedText = stemmedTerm, Text = rawTerm };
        // FIX: removed `var test = db.Terms.Where(...).Any()` — the result was
        // never read, so it was a dead database round trip per term.
        // NOTE(review): stopwords.Contains(termObj) compares a freshly built
        // Term object; it only filters stopwords if Term overrides Equals —
        // confirm, otherwise compare by StemmedText.
        if (!allTerms.Any(i => i.StemmedText == stemmedTerm) && !stopwords.Contains(termObj))
        {
            db.Terms.Add(termObj);
            db.SaveChanges();
            allTerms.Add(termObj);
        }
    }

    // For every dictionary term present in this document (and not a stopword),
    // record a TermDocumentWeight with its raw in-document frequency.
    foreach (var term in allTerms)
    {
        if (stemmedTermsInDocument.Contains(term.StemmedText) && !stopwords.Contains(term))
        {
            TermDocumentWeight termDocumentWeight = new TermDocumentWeight();
            termDocumentWeight.DocumentID = document.ID;
            termDocumentWeight.TermID = term.ID;
            termDocumentWeight.TermFrequency = stemmedTermsInDocument.Count(i => i == term.StemmedText);
            db.TermDocumentWeights.Add(termDocumentWeight);
            // Reconstructed from a line-wrapped literal in the original source.
            Debug.WriteLine("TDW for " + termDocumentWeight.Term + " added, ID: " + termDocumentWeight.ID);
        }
    }
    db.SaveChanges();
}