Exemplo n.º 1
0
 /// <summary>
 /// Copies the stemmer state (Y-found flag and the p1/p2 region markers)
 /// from <paramref name="other"/> into this instance.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied.</param>
 private void copy_from(EnglishStemmer other)
 {
     B_Y_found = other.B_Y_found;
     I_p2      = other.I_p2;
     I_p1      = other.I_p1;
     // BUG FIX: the original called copy_from(other) here — unconditional
     // self-recursion that overflows the stack. Snowball-generated stemmers
     // delegate to the base SnowballProgram copy at this point instead.
     base.copy_from(other);
 }
Exemplo n.º 2
0
        /// <summary>
        /// Tokenizes the search phrase into distinct stemmed terms and returns the
        /// matching <see cref="Term"/> entities already present in the database.
        /// Side effect: SearchPhrase is rewritten with punctuation stripped
        /// (matching the original behavior).
        /// </summary>
        /// <returns>Known Term entities, one per distinct stemmed word in SearchPhrase.</returns>
        public List<Term> termsInQuery()
        {
            // Strip punctuation; StringBuilder avoids O(n^2) string concatenation.
            var withoutPunctuation = new StringBuilder(SearchPhrase.Length);
            foreach (var c in SearchPhrase)
            {
                if (!char.IsPunctuation(c))
                {
                    withoutPunctuation.Append(c);
                }
            }
            SearchPhrase = withoutPunctuation.ToString();

            // RemoveEmptyEntries guards against "" terms from consecutive spaces.
            List<string> termStrings = SearchPhrase.ToLower()
                .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                .ToList();

            EnglishStemmer stemmer = new EnglishStemmer();
            List<string> stemmedTerms = new List<string>();
            foreach (var ts in termStrings)
            {
                stemmedTerms.Add(stemmer.Stem(ts));
            }

            List<Term> terms = new List<Term>();
            foreach (var stemmedterm in stemmedTerms.Distinct())
            {
                // Single round-trip: FirstOrDefault replaces the original
                // Any() + First() pair, which ran the same query twice.
                var match = db.Terms.FirstOrDefault(i => i.StemmedText == stemmedterm);
                if (match != null)
                {
                    terms.Add(match);
                }
            }
            return terms;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Handler for the folder button: lets the user pick a directory, indexes every
        /// file in it (stemmed, tokenized text) into an on-disk Lucene index, then walks
        /// the index and prints each indexed term to the console.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnFolder_Click(object sender, EventArgs e)
        {
            FolderBrowserDialog dia = new FolderBrowserDialog();
            if (dia.ShowDialog() != System.Windows.Forms.DialogResult.OK)
            {
                return;
            }

            FSDirectory dir = FSDirectory.GetDirectory(Environment.CurrentDirectory + "\\LuceneIndex");
            Lucene.Net.Analysis.Standard.StandardAnalyzer an = new Lucene.Net.Analysis.Standard.StandardAnalyzer();
            // 'true' recreates the index from scratch on every click.
            IndexWriter wr = new IndexWriter(dir, an, true);
            IStemmer stemmer = new EnglishStemmer();
            DirectoryInfo diMain = new DirectoryInfo(dia.SelectedPath);
            foreach (FileInfo fi in diMain.GetFiles())
            {
                Document doc = new Document();
                // Title is stored for retrieval but not indexed/searchable.
                doc.Add(new Field("title", fi.Name, Field.Store.YES, Field.Index.NO));
                doc.Add(new Field("text",
                    PerformStemming(stemmer, NLPToolkit.Tokenizer.TokenizeNow(File.ReadAllText(fi.FullName)).ToArray()),
                    Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
                wr.AddDocument(doc);
            }
            wr.Optimize();
            wr.Flush();
            wr.Close();

            // BUG FIX: the original closed 'dir' *before* opening an IndexReader on it,
            // reading from a closed directory. Read first, then close the reader and
            // finally the directory.
            IndexReader reader = IndexReader.Open(dir);
            for (int i = 0; i < reader.MaxDoc(); i++)
            {
                if (reader.IsDeleted(i))
                    continue;

                Document doc = reader.Document(i);
                // NOTE(review): no "docId" field is ever added above, so this is
                // expected to be null — confirm whether the field name is stale.
                String docId = doc.Get("docId");
                foreach (TermFreqVector vector in reader.GetTermFreqVectors(i))
                {
                    foreach (string term in vector.GetTerms())
                    {
                        Console.WriteLine(term);
                    }
                }
            }
            reader.Close();
            dir.Close();
        }
        /// <summary>
        /// Builds an on-disk Lucene index over tweets from the given Mongo collection,
        /// one day-sized window at a time starting 2014-05-17 and ending today. Each
        /// tweet is stemmed, scored by the classifier, and stored with its metadata
        /// and positive/negative sentiment scores.
        /// </summary>
        /// <param name="collection">Source collection of tweets to index.</param>
        public static void CreateIndex(MongoCollection<TweetItem> collection)
        {
            // Day window [dtmFirst, dtmLast]; both bounds advance one day per loop
            // iteration until the window reaches the current date.
            DateTime dtmFirst = new DateTime(2014, 05, 17, 0, 0, 0);
            DateTime dtmLast = new DateTime(2014, 05, 17, 23, 59, 59);
            FSDirectory dir = FSDirectory.GetDirectory(Environment.CurrentDirectory + "\\LuceneIndex");
            //Lucene.Net.Store.RAMDirectory dir = new RAMDirectory();
            Lucene.Net.Analysis.StopAnalyzer an = new Lucene.Net.Analysis.StopAnalyzer();
            // 'true' recreates the index from scratch on every call.
            IndexWriter wr = new IndexWriter(dir, an, true);
            IStemmer stemmer = new EnglishStemmer();
            while (dtmFirst.Date <= DateTime.Now.Date)
            {
                // Fetch all tweets created inside the current one-day window.
                var query = Query<TweetItem>.Where(t => t.CreationDate >= dtmFirst && t.CreationDate <= dtmLast);
                List<TweetItem> value = collection.Find(query).ToList();
                //DirectoryInfo diMain = new DirectoryInfo(dia.SelectedPath);               
                // NOTE(review): this HttpClient is configured but never used — the
                // sentiment web call below is commented out. Consider removing it (a
                // new client per day-window is also an anti-pattern if re-enabled).
                using (var client = new HttpClient())
                {
                    client.BaseAddress = new Uri("http://www.datasciencetoolkit.org/text2sentiment");
                    client.DefaultRequestHeaders.Accept.Clear();
                    client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));

                    foreach (TweetItem tweet in value)
                    {
                        Document doc = new Document();
                        
                        //SentimentResult res = await GetSentiment(tweet.Text, client);                        
                        // Stem the tokenized tweet text, then score it locally.
                        string stemmedtext = PerformStemming(stemmer, NLPToolkit.Tokenizer.TokenizeNow(tweet.Text).ToArray());
                        var scores = classifier.Classify(stemmedtext,DragonHelper.DragonHelper.ExcludeList);
                        string positiveSentiment = string.Empty;
                        string negativeSentiment = string.Empty;                        
                        positiveSentiment = scores["Positive"].ToString();
                        negativeSentiment = scores["Negative"].ToString();
                        // Metadata fields are stored but not indexed; text and the two
                        // sentiment fields are tokenized with term vectors.
                        doc.Add(new Field("id", tweet._id.ToString(), Field.Store.YES, Field.Index.NO));
                        doc.Add(new Field("created", tweet.CreationDate.ToString(), Field.Store.YES, Field.Index.NO));
                        doc.Add(new Field("user", tweet.User, Field.Store.YES, Field.Index.NO));                        
                        doc.Add(new Field("text", stemmedtext, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));                        
                        doc.Add(new Field("possentiment", positiveSentiment , Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));
                        doc.Add(new Field("negsentiment", negativeSentiment, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.YES));

                        wr.AddDocument(doc);
                    }
                }
                // Advance the window to the next day.
                dtmFirst = dtmFirst.AddDays(1);
                dtmLast = dtmLast.AddDays(1);
            }
            wr.Optimize();
            wr.Flush();
            wr.Close();
            dir.Close();
        }
Exemplo n.º 5
0
        /// <summary>
        /// Stems each whitespace-separated word of <paramref name="opText"/> and
        /// rejoins the stems separated by single spaces. The result keeps a trailing
        /// space, matching the original behavior.
        /// </summary>
        /// <param name="opText">Text whose words are stemmed.</param>
        /// <returns>Space-joined stemmed words (with a trailing space).</returns>
        private string StemWords(string opText)
        {
            // Split(null) splits on any whitespace characters.
            string[] opArr = opText.Split(null);

            EnglishStemmer stemmer = new EnglishStemmer();
            // StringBuilder avoids the original's O(n^2) string concatenation;
            // the dead 'currentPostStems' dictionary and commented-out code were removed.
            StringBuilder outputText = new StringBuilder(opText.Length);
            foreach (string word in opArr)
            {
                outputText.Append(stemmer.Stem(word)).Append(' ');
            }

            return outputText.ToString();
        }
 /// <summary>
 /// Initializes the shared tagging state: stop-word tables, English/Russian
 /// stemmers, the working token list, and the tag container.
 /// </summary>
 /// <param name="t">Token list; 'ts' aliases this list (not a copy), as before.</param>
 public static void InitParams(List<string> t)
 {
     Eng = new StWdsEng();
     Rus = new StWdsRus();
     EngStem = new EnglishStemmer();
     RusStem = new RussianStemmer();
     // FIX: removed the dead 'ts = new List<string>()' allocation that was
     // immediately overwritten by the assignment below.
     ts = t;
     Tag = new List<List<Word>>();
 }
 /// <summary>
 /// Initializes the shared tagging state from an array of tokens: stop-word
 /// tables, English/Russian stemmers, the working token list, and the tag
 /// container. Unlike the List overload, 'ts' receives a copy of the input.
 /// </summary>
 /// <param name="t">Tokens copied into the working list.</param>
 public static void InitParams(string[] t)
 {
     Eng = new StWdsEng();
     Rus = new StWdsRus();
     EngStem = new EnglishStemmer();
     RusStem = new RussianStemmer();
     // FIX: removed the redundant 'temp' alias of 't'; the copying constructor
     // replaces the new-then-AddRange pair with identical results.
     ts = new List<string>(t);
     Tag = new List<List<Word>>();
 }
 /// <summary>
 /// Copies the stemmer state (Y-found flag and the p1/p2 region markers)
 /// from <paramref name="other"/> into this instance.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied.</param>
 private void copy_from(EnglishStemmer other)
 {
     B_Y_found = other.B_Y_found;
     I_p2 = other.I_p2;
     I_p1 = other.I_p1;
     // BUG FIX: the original called copy_from(other) here — unconditional
     // self-recursion that overflows the stack. Snowball-generated stemmers
     // delegate to the base SnowballProgram copy at this point instead.
     base.copy_from(other);
 }
 /// <summary>
 /// Initializes the shared tagging state for the two-collection variant:
 /// stop-word tables, English/Russian stemmers, the working token list, and
 /// a pair of word/tag collections (index 0 and 1).
 /// </summary>
 /// <param name="t">Token list; 'ts' aliases this list (not a copy), as before.</param>
 public static void InitParams(List<string> t)
 {
     Eng = new StWdsEng();
     Rus = new StWdsRus();
     EngStem = new EnglishStemmer();
     RusStem = new RussianStemmer();
     // FIX: removed the dead 'ts = new List<string>()' allocation that was
     // immediately overwritten by the assignment below.
     ts = t;
     fullWdsCollection = new List<List<string>>[2];
     fullWdsCollection[0] = new List<List<string>>();
     fullWdsCollection[1] = new List<List<string>>();
     Tag = new List<List<Word>>[2];
     Tag[0] = new List<List<Word>>();
     Tag[1] = new List<List<Word>>();
 }
 /// <summary>
 /// Creates the wrapper, backing it with a fresh <see cref="EnglishStemmer"/>.
 /// </summary>
 public ExternalEnglishStemmer()
 {
     this.stemmer = new EnglishStemmer();
 }
        /// <summary>
        /// Processes one document: strips punctuation, stems its terms, registers
        /// previously-unseen non-stopword terms in the database (and the in-memory
        /// cache), then records a TermDocumentWeight row with the raw term frequency
        /// for every known term the document contains.
        /// </summary>
        /// <param name="document">Document whose Content is tokenized and indexed.</param>
        /// <param name="totalNoOfDocumentsInCollection">Collection size (unused in this method).</param>
        /// <param name="stopwords">Terms excluded from the dictionary and from weighting.</param>
        /// <param name="allTerms">In-memory cache of known terms; new terms are appended.</param>
        private void processDocument(Document document, int totalNoOfDocumentsInCollection, List<Term> stopwords, List<Term> allTerms)
        {
            // Remove all punctuation; StringBuilder avoids O(n^2) concatenation.
            var withoutPunctuation = new StringBuilder(document.Content.Length);
            foreach (var c in document.Content)
            {
                if (!char.IsPunctuation(c))
                {
                    withoutPunctuation.Append(c);
                }
            }
            string documentWithoutPunctuation = withoutPunctuation.ToString();

            // Split on spaces and Windows line breaks, dropping empty entries.
            string[] splitStrings = { " ", "\r\n" };
            List<string> rawTermsInDocument = documentWithoutPunctuation.ToLower().Split(splitStrings, StringSplitOptions.RemoveEmptyEntries).ToList();

            // Stem each term; add unseen, non-stopword terms to db.Terms and the cache.
            EnglishStemmer stemmer = new EnglishStemmer();
            List<string> stemmedTermsInDocument = new List<string>();
            foreach (var rawTerm in rawTermsInDocument)
            {
                string stemmedTerm = stemmer.Stem(rawTerm);
                stemmedTermsInDocument.Add(stemmedTerm);
                Term termObj = new Term { StemmedText = stemmedTerm, Text = rawTerm };
                // FIX: removed the dead 'test' local, which executed a db query whose
                // result was never read. Any(predicate) replaces Where(...).Any().
                // NOTE(review): stopwords.Contains relies on Term's equality semantics
                // (reference equality unless Term overrides Equals) — confirm intended.
                if (!allTerms.Any(i => i.StemmedText == stemmedTerm) && !stopwords.Contains(termObj))
                {
                    db.Terms.Add(termObj);
                    db.SaveChanges();
                    allTerms.Add(termObj);
                }
            }

            // Pre-compute per-stem counts once instead of re-scanning the whole
            // document list for every known term (was O(terms * docLength)).
            Dictionary<string, int> termCounts = stemmedTermsInDocument
                .GroupBy(s => s)
                .ToDictionary(g => g.Key, g => g.Count());

            /* For every known term that occurs in this document (and is not a stopword),
               record a TermDocumentWeight row carrying its raw term frequency. */
            foreach (var term in allTerms)
            {
                int frequency;
                if (termCounts.TryGetValue(term.StemmedText, out frequency) && !stopwords.Contains(term))
                {
                    TermDocumentWeight termDocumentWeight = new TermDocumentWeight();
                    termDocumentWeight.DocumentID = document.ID;
                    termDocumentWeight.TermID = term.ID;
                    termDocumentWeight.TermFrequency = frequency;
                    db.TermDocumentWeights.Add(termDocumentWeight);
                    Debug.WriteLine("TDW for " + termDocumentWeight.Term + " added, ID: " + termDocumentWeight.ID);
                }
            }
            db.SaveChanges();
        }