/// <summary>
/// Resolves the current <c>SearchPhrase</c> to the <c>Term</c> rows in <c>db.Terms</c> whose
/// <c>StemmedText</c> equals the stem of a word in the phrase.
/// </summary>
/// <remarks>
/// Side effect: <c>SearchPhrase</c> is overwritten with its punctuation-free form.
/// </remarks>
/// <returns>
/// One term per distinct stem that has a match in <c>db.Terms</c>; an empty list when the
/// phrase is null, contains no words, or nothing matches.
/// </returns>
public List<Term> termsInQuery()
{
    // A missing phrase has no terms; the original threw NullReferenceException here.
    if (SearchPhrase == null)
    {
        return new List<Term>();
    }

    // Strip punctuation in a single pass instead of re-allocating the string per character.
    SearchPhrase = new string(SearchPhrase.Where(c => !char.IsPunctuation(c)).ToArray());

    // RemoveEmptyEntries: consecutive spaces must not yield "" tokens that would be
    // stemmed and looked up in the database (processDocument tokenises the same way).
    string[] words = SearchPhrase.ToLower().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

    EnglishStemmer stemmer = new EnglishStemmer();
    List<string> distinctStems = words.Select(word => stemmer.Stem(word)).Distinct().ToList();

    List<Term> terms = new List<Term>();
    foreach (string stem in distinctStems)
    {
        // One query per stem (FirstOrDefault) instead of Any() followed by First().
        Term match = db.Terms.FirstOrDefault(t => t.StemmedText == stem);
        if (match != null)
        {
            terms.Add(match);
        }
    }

    return terms;
}
/// <summary>
/// Stems every white-space-separated word of <paramref name="opText"/> and concatenates the stems.
/// </summary>
/// <param name="opText">Text to stem. It is split on white-space characters.</param>
/// <returns>
/// Each stem followed by a single space, so the result ends with a trailing space;
/// an empty string when <paramref name="opText"/> is null.
/// </returns>
private string StemWords(string opText)
{
    // The original threw NullReferenceException on null input.
    if (opText == null)
    {
        return string.Empty;
    }

    // A null separator array makes Split use white-space as the delimiter; empty entries
    // are kept, exactly as before, so the output spacing is unchanged.
    string[] words = opText.Split((char[])null);

    EnglishStemmer stemmer = new EnglishStemmer();

    // Build the result in one pass rather than growing a string with += per word.
    return string.Concat(words.Select(word => stemmer.Stem(word) + " "));
}
/// <summary>
/// Indexes one document: adds stemmed terms that are not yet known to <c>db.Terms</c>, then
/// stores a <c>TermDocumentWeight</c> carrying the term frequency for every known term that
/// occurs in the document.
/// </summary>
/// <param name="document">Document whose <c>Content</c> is tokenised; its <c>ID</c> is written to each weight row.</param>
/// <param name="totalNoOfDocumentsInCollection">Not used by this method; kept so existing callers keep compiling.</param>
/// <param name="stopwords">Terms that must not be indexed.</param>
/// <param name="allTerms">All terms known so far. Newly added terms are appended to this list.</param>
private void processDocument(Document document, int totalNoOfDocumentsInCollection, List<Term> stopwords, List<Term> allTerms)
{
    // Remove all punctuation in a single pass (the original rebuilt the string per character).
    // Null content is treated as an empty document.
    string content = document.Content ?? string.Empty;
    string documentWithoutPunctuation = new string(content.Where(c => !char.IsPunctuation(c)).ToArray());

    // Split into raw terms on spaces and on "\r\n" line breaks, dropping empty tokens.
    string[] splitStrings = { " ", "\r\n" };
    string[] rawTermsInDocument = documentWithoutPunctuation.ToLower().Split(splitStrings, StringSplitOptions.RemoveEmptyEntries);

    EnglishStemmer stemmer = new EnglishStemmer();

    // stem -> number of occurrences in this document, built once.
    Dictionary<string, int> termFrequencies = new Dictionary<string, int>();

    // Stems already present in allTerms, for constant-time "seen before?" checks.
    HashSet<string> knownStems = new HashSet<string>(allTerms.Select(t => t.StemmedText));

    List<Term> newTerms = new List<Term>();

    foreach (string rawTerm in rawTermsInDocument)
    {
        string stemmedTerm = stemmer.Stem(rawTerm);

        int occurrencesSoFar;
        termFrequencies.TryGetValue(stemmedTerm, out occurrencesSoFar);
        termFrequencies[stemmedTerm] = occurrencesSoFar + 1;

        if (knownStems.Contains(stemmedTerm))
        {
            continue;
        }

        Term termObj = new Term { StemmedText = stemmedTerm, Text = rawTerm };

        // NOTE(review): List<T>.Contains relies on Term's equality. Unless Term overrides
        // Equals, a freshly constructed Term never equals a stored stopword, so this check
        // may never filter anything -- confirm against the Term class.
        if (stopwords.Contains(termObj))
        {
            continue;
        }

        db.Terms.Add(termObj);
        newTerms.Add(termObj);
        knownStems.Add(stemmedTerm);
    }

    // Persist all new terms in one batch (the original called SaveChanges once per term).
    // This happens before the weights are built because term.ID is read below; presumably
    // the ID is assigned on save -- the original also saved before reading it.
    if (newTerms.Count > 0)
    {
        db.SaveChanges();
        allTerms.AddRange(newTerms);
    }

    // For every known term that occurs in this document and is not a stopword,
    // create a term-document weight holding its term frequency.
    foreach (Term term in allTerms)
    {
        int frequency;
        if (term.StemmedText == null || !termFrequencies.TryGetValue(term.StemmedText, out frequency))
        {
            continue;
        }

        if (stopwords.Contains(term))
        {
            continue;
        }

        TermDocumentWeight termDocumentWeight = new TermDocumentWeight();
        termDocumentWeight.DocumentID = document.ID;
        termDocumentWeight.TermID = term.ID;
        termDocumentWeight.TermFrequency = frequency;
        db.TermDocumentWeights.Add(termDocumentWeight);

        // NOTE(review): this is logged before the final SaveChanges, so Term and ID may
        // still hold their unsaved defaults here -- confirm whether that is intended.
        Debug.WriteLine("TDW for " + termDocumentWeight.Term + " added, ID: " + termDocumentWeight.ID);
    }

    db.SaveChanges();
}