/// <summary>
/// Handles submission of a new document. When <paramref name="allFormat"/> is set, the content is
/// handed to allFormatInput; otherwise the document is saved as-is and indexed with processDocument.
/// </summary>
/// <param name="document">The submitted document.</param>
/// <param name="allFormat">True to route the submission through allFormatInput instead of saving it directly.</param>
/// <returns>The Create view again when model validation fails, otherwise a redirect to Index.</returns>
public ActionResult Create(Document document, Boolean allFormat)
        {
            if (allFormat)
            {
                allFormatInput(document);
                return RedirectToAction("Index");
            }

            // single-document path: bail out before touching the database if validation failed
            if (!ModelState.IsValid)
            {
                return View();
            }

            db.Documents.Add(document);
            db.SaveChanges();

            // resolve every stopword row to its Term entity (Find is evaluated in memory, one lookup per row)
            List<Term> stopwordTerms = db.Stopwords
                .ToList()
                .Select(entry => db.Terms.Find(entry.TermID))
                .ToList();

            processDocument(document, db.Documents.Count(), stopwordTerms, db.Terms.ToList());
            return RedirectToAction("Index");
        }
Example #2
0
 /// <summary>
 /// Looks up the term frequency recorded for this object's ID in document <paramref name="d"/>.
 /// </summary>
 /// <param name="d">Document whose ID is matched against each entry's DocumentID.</param>
 /// <param name="termDocumentWeights">Entries to search.</param>
 /// <returns>TermFrequency of the first entry whose TermID equals ID and whose DocumentID equals d.ID.</returns>
 /// <exception cref="System.InvalidOperationException">No entry matches.</exception>
 public int getFrequency(Document d, List<TermDocumentWeight> termDocumentWeights)
 {
     // NOTE(review): ID is a member of the enclosing type, which is not visible here - presumably the term's key.
     TermDocumentWeight match = termDocumentWeights.First(weight => weight.TermID == ID && weight.DocumentID == d.ID);
     return match.TermFrequency;
 }
        /// <summary>
        /// Tokenizes and stems the content of <paramref name="document"/>, adds every previously unseen
        /// stem to db.Terms, and stores one TermDocumentWeight (carrying the term frequency) for each
        /// known non-stopword term that occurs in the document.
        /// </summary>
        /// <param name="document">Document to index; its ID is copied onto each TermDocumentWeight. A null Content is treated as empty.</param>
        /// <param name="totalNoOfDocumentsInCollection">Not used by this method; kept so existing callers keep compiling.</param>
        /// <param name="stopwords">Terms that must not be indexed. Null entries are tolerated.</param>
        /// <param name="allTerms">Terms already known; terms created here are appended to this list.</param>
        private void processDocument(Document document, int totalNoOfDocumentsInCollection, List<Term> stopwords, List<Term> allTerms)
        {
            // remove all punctuation (StringBuilder avoids quadratic string concatenation on long documents)
            System.Text.StringBuilder withoutPunctuation = new System.Text.StringBuilder();
            foreach (char c in document.Content ?? string.Empty)
            {
                if (!char.IsPunctuation(c))
                {
                    withoutPunctuation.Append(c);
                }
            }

            // split into raw terms on spaces, tabs and line breaks ("\r\n" is listed before "\n" so a CRLF is consumed as one separator);
            // lower-casing is culture-invariant so the resulting terms do not depend on the server's locale
            string[] separators = { " ", "\r\n", "\n", "\t" };
            string[] rawTermsInDocument = withoutPunctuation.ToString()
                .ToLowerInvariant()
                .Split(separators, StringSplitOptions.RemoveEmptyEntries);

            // stems already present in the dictionary, and stems of stopwords; both are compared by value because a
            // freshly constructed Term is never reference-equal to an entry of the stopword list
            HashSet<string> knownStems = new HashSet<string>(allTerms.Select(t => t.StemmedText));
            HashSet<string> stopwordStems = new HashSet<string>(stopwords.Where(s => s != null).Select(s => s.StemmedText));

            // stem each term, remember it for the frequency count, and register it in db.Terms if it is new
            EnglishStemmer stemmer = new EnglishStemmer();
            List<string> stemmedTermsInDocument = new List<string>(rawTermsInDocument.Length);
            bool addedTerms = false;
            foreach (string rawTerm in rawTermsInDocument)
            {
                string stemmedTerm = stemmer.Stem(rawTerm);
                stemmedTermsInDocument.Add(stemmedTerm);

                if (knownStems.Contains(stemmedTerm) || stopwordStems.Contains(stemmedTerm))
                {
                    continue;
                }

                Term termObj = new Term { StemmedText = stemmedTerm, Text = rawTerm };
                db.Terms.Add(termObj);
                allTerms.Add(termObj);
                knownStems.Add(stemmedTerm);
                addedTerms = true;
            }

            // one save for all new terms; this must happen before the loop below reads term.ID
            // NOTE(review): assumes Term.ID is generated by the database on save - confirm against the model
            if (addedTerms)
            {
                db.SaveChanges();
            }

            // group the document's stems once so each term's frequency is a lookup instead of a rescan of the document
            ILookup<string, string> occurrencesByStem = stemmedTermsInDocument.ToLookup(s => s);

            /* goes through all known terms and, for each one that occurs in the document and is not a stopword,
            creates a term document weight holding its term frequency */
            foreach (Term term in allTerms)
            {
                int frequency = occurrencesByStem[term.StemmedText].Count();
                if (frequency > 0 && !stopwords.Contains(term))
                {
                    TermDocumentWeight termDocumentWeight = new TermDocumentWeight();
                    termDocumentWeight.DocumentID = document.ID;
                    termDocumentWeight.TermID = term.ID;
                    termDocumentWeight.TermFrequency = frequency;
                    db.TermDocumentWeights.Add(termDocumentWeight);
                    Debug.WriteLine("TDW for " + termDocumentWeight.Term + " added, ID: " + termDocumentWeight.ID);
                }
            }
            db.SaveChanges();
        }
        /// <summary>
        /// Splits the content of <paramref name="document"/> into several documents and stores and indexes each one.
        /// A line for which matchValue reports a ".I" match starts a new document named with the returned value,
        /// lines reported as ".W" matches are dropped, and every other line is appended to the current document.
        /// Lines that appear before the first ".I" line are ignored, and a null or empty Content stores nothing.
        /// </summary>
        /// <param name="document">Carrier of the raw multi-document text in its Content property.</param>
        private void allFormatInput(Document document)
        {
            var dotIRegex = new Regex(@"^\.I\s+(\d+)$");
            var dotWRegex = new Regex(@"^\.W$");
            //retrieve all the stopwords from db.terms and put them in a list
            List<Term> stopwords = new List<Term>();
            foreach (var sw in db.Stopwords.ToList())
            {
                stopwords.Add(db.Terms.Find(sw.TermID));
            }
            using (StringReader reader = new StringReader(document.Content ?? string.Empty))
            {
                string line;
                Document doc = null;
                // content of the document currently being read; assigned to doc.Content when the document is stored
                System.Text.StringBuilder content = new System.Text.StringBuilder();
                int totalNoOfDocuments = db.Documents.Count();
                while ((line = reader.ReadLine()) != null)
                {
                    String iValue = matchValue(dotIRegex, line);
                    String wValue = matchValue(dotWRegex, line);

                    //the line is a .I line: store the document read so far (if any) and start the next one
                    if (iValue != null)
                    {
                        if (doc != null)
                        {
                            storeParsedDocument(doc, content.ToString(), totalNoOfDocuments, stopwords);
                        }
                        doc = new Document { Name = iValue };
                        content.Length = 0;
                        continue;
                    }

                    //text before the first .I line belongs to no document
                    if (doc == null)
                    {
                        continue;
                    }

                    //don't add the line if the line is a .W line
                    if (wValue == null)
                    {
                        // ReadLine strips the line break; put one back so the last word of this line
                        // is not fused with the first word of the next
                        if (content.Length > 0)
                        {
                            content.Append("\r\n");
                        }
                        content.Append(line);
                    }
                }

                //store the last document, unless the input contained no .I line at all
                if (doc != null)
                {
                    storeParsedDocument(doc, content.ToString(), totalNoOfDocuments, stopwords);
                }
            }
        }

        // Assigns the accumulated content to doc, saves it, and indexes it against the current db.Terms.
        private void storeParsedDocument(Document doc, string content, int totalNoOfDocuments, List<Term> stopwords)
        {
            doc.Content = content;
            db.Documents.Add(doc);
            db.SaveChanges();
            processDocument(doc, totalNoOfDocuments, stopwords, db.Terms.ToList());
        }