Exemplo n.º 1
0
        /// <summary>
        /// Resolves the current <c>SearchPhrase</c> to the known terms it mentions.
        /// </summary>
        /// <remarks>
        /// Side effect: <c>SearchPhrase</c> is overwritten with its punctuation-free form.
        /// The phrase is then lower-cased, split on single spaces, stemmed with
        /// <c>EnglishStemmer</c> and de-duplicated; every distinct stem is looked up in
        /// <c>db.Terms</c> by <c>StemmedText</c>.
        /// </remarks>
        /// <returns>
        /// One <c>Term</c> per distinct stem that has a match in <c>db.Terms</c>. The list is
        /// empty when nothing matches or when <c>SearchPhrase</c> is null.
        /// </returns>
        public List<Term> termsInQuery()
        {
            List<Term> terms = new List<Term>();
            if (SearchPhrase == null)
            {
                // Nothing to search for; avoid a NullReferenceException further down.
                return terms;
            }

            // Strip punctuation in a single pass rather than growing a string char by char.
            SearchPhrase = new string(SearchPhrase.Where(c => !char.IsPunctuation(c)).ToArray());

            // Empty tokens (produced by consecutive spaces) are skipped so they are
            // neither stemmed nor sent to the database.
            EnglishStemmer stemmer = new EnglishStemmer();
            List<string> stemmedTerms = SearchPhrase.ToLower()
                .Split(' ')
                .Where(ts => ts.Length > 0)
                .Select(ts => stemmer.Stem(ts))
                .Distinct()
                .ToList();

            foreach (var stemmedterm in stemmedTerms)
            {
                // A single query per stem: FirstOrDefault yields null when there is no match.
                Term match = db.Terms.FirstOrDefault(i => i.StemmedText == stemmedterm);
                if (match != null)
                {
                    terms.Add(match);
                }
            }
            return terms;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Stems every word of <paramref name="opText"/> with <c>EnglishStemmer</c>.
        /// </summary>
        /// <param name="opText">
        /// Text to stem. It is split with <c>Split(null)</c>, i.e. on white-space characters;
        /// empty entries are kept, so consecutive white-space yields empty words.
        /// </param>
        /// <returns>
        /// The stemmed words in their original order, each followed by a single space
        /// (the result therefore ends with a trailing space).
        /// </returns>
        private string StemWords(string opText)
        {
            EnglishStemmer stemmer = new EnglishStemmer();

            // Accumulate in a builder: repeated string concatenation in a loop is quadratic.
            // Fully qualified so no extra using directive is required.
            System.Text.StringBuilder outputText = new System.Text.StringBuilder();
            foreach (string word in opText.Split(null))
            {
                outputText.Append(stemmer.Stem(word)).Append(' ');
            }

            return outputText.ToString();
        }
        /// <summary>
        /// Indexes one document: registers stemmed terms that have not been seen before and
        /// adds a <c>TermDocumentWeight</c> row (with its term frequency) for every known,
        /// non-stop-word term that occurs in the document.
        /// </summary>
        /// <param name="document">
        /// Document to index. Its <c>Content</c> is tokenised and its <c>ID</c> is written to
        /// each weight row.
        /// </param>
        /// <param name="totalNoOfDocumentsInCollection">Not used by this method.</param>
        /// <param name="stopwords">
        /// Terms to exclude. Membership is tested with <c>List.Contains</c>, so it relies on
        /// <c>Term</c> equality.
        /// </param>
        /// <param name="allTerms">
        /// Terms known so far. New terms found in the document are appended to this list, so
        /// the caller observes the additions.
        /// </param>
        private void processDocument(Document document, int totalNoOfDocumentsInCollection, List<Term> stopwords, List<Term> allTerms)
        {
            // remove all punctuation (single pass instead of per-character string concatenation)
            string documentWithoutPunctuation = new string(document.Content.Where(c => !char.IsPunctuation(c)).ToArray());

            //put the terms in the document onto a string list, splitting whenever there is a space, and whenever there is \r\n in document.
            string[] splitStrings = { " ", "\r\n" };
            List<string> rawTermsInDocument = documentWithoutPunctuation.ToLower().Split(splitStrings, StringSplitOptions.RemoveEmptyEntries).ToList();

            // Stems already present in allTerms, for O(1) "seen before?" checks.
            HashSet<string> knownStems = new HashSet<string>(allTerms.Select(t => t.StemmedText));

            //stems each term in the document, and registers it if it hasn't been encountered before. Also builds the list of all stemmed terms in the document
            EnglishStemmer stemmer = new EnglishStemmer();
            List<string> stemmedTermsInDocument = new List<string>();
            bool newTermsAdded = false;
            foreach (var rawTerm in rawTermsInDocument)
            {
                string stemmedTerm = stemmer.Stem(rawTerm);
                stemmedTermsInDocument.Add(stemmedTerm);
                if (knownStems.Contains(stemmedTerm))
                {
                    continue;
                }

                Term termObj = new Term { StemmedText = stemmedTerm, Text = rawTerm };
                // NOTE(review): termObj is a brand-new instance, so this check only filters
                // anything if Term defines value equality - confirm against the Term class.
                if (!stopwords.Contains(termObj))
                {
                    db.Terms.Add(termObj);
                    allTerms.Add(termObj);
                    knownStems.Add(stemmedTerm);
                    newTermsAdded = true;
                }
            }

            // Persist all new terms in one call (instead of one call per term) before their
            // IDs are read below.
            if (newTermsAdded)
            {
                db.SaveChanges();
            }

            // Group the document's stems once so each term's frequency is a lookup rather
            // than a rescan of the whole list. Indexing a missing key yields an empty group.
            ILookup<string, string> occurrencesByStem = stemmedTermsInDocument.ToLookup(stem => stem);

            /*goes through all known terms and, if the term occurs in the document and is not a stop word,
            creates a term document weight for it carrying its term frequency*/
            foreach (var term in allTerms)
            {
                int termFrequency = occurrencesByStem[term.StemmedText].Count();
                if (termFrequency > 0 && !stopwords.Contains(term))
                {
                    TermDocumentWeight termDocumentWeight = new TermDocumentWeight();
                    termDocumentWeight.DocumentID = document.ID;
                    termDocumentWeight.TermID = term.ID;
                    termDocumentWeight.TermFrequency = termFrequency;
                    db.TermDocumentWeights.Add(termDocumentWeight);
                    // NOTE(review): this runs before the SaveChanges below, so Term and ID may
                    // not be populated yet at this point - confirm the output is useful.
                    Debug.WriteLine("TDW for " + termDocumentWeight.Term + " added, ID: " + termDocumentWeight.ID);
                }
            }
            db.SaveChanges();
        }