/// <summary>
/// Handles submission of the create form. When <paramref name="allFormat"/> is set, the
/// submitted content is handed to allFormatInput; otherwise the single document is saved
/// and indexed via processDocument.
/// </summary>
/// <param name="document">The model-bound document from the form.</param>
/// <param name="allFormat">True when the content should be treated as a multi-document collection.</param>
/// <returns>A redirect to Index on success, or the create view when the model state is invalid.</returns>
public ActionResult Create(Document document, Boolean allFormat)
{
    // Collection import does its own saving and indexing.
    if (allFormat)
    {
        allFormatInput(document);
        return RedirectToAction("Index");
    }

    // Single-document path: bail out to the form when validation failed.
    if (!ModelState.IsValid)
    {
        return View();
    }

    // add document to the database
    db.Documents.Add(document);
    db.SaveChanges();

    // Resolve every stopword row to its Term entity.
    List<Term> stopwords = db.Stopwords
        .ToList()
        .Select(stopword => db.Terms.Find(stopword.TermID))
        .ToList();

    int documentCount = db.Documents.Count();
    List<Term> knownTerms = db.Terms.ToList();
    processDocument(document, documentCount, stopwords, knownTerms);

    return RedirectToAction("Index");
}
/// <summary>
/// Returns the TermFrequency of the first entry in <paramref name="termDocumentWeights"/>
/// whose TermID equals this object's ID and whose DocumentID equals the ID of
/// <paramref name="d"/>.
/// </summary>
/// <param name="d">The document whose weight entry is wanted.</param>
/// <param name="termDocumentWeights">The weight entries to search.</param>
/// <returns>The term frequency stored on the matching entry.</returns>
/// <exception cref="InvalidOperationException">No entry matches.</exception>
public int getFrequency(Document d, List<TermDocumentWeight> termDocumentWeights)
{
    var matchingWeights = termDocumentWeights.Where(weight => weight.TermID == ID && weight.DocumentID == d.ID);
    var firstMatch = matchingWeights.First();
    return firstMatch.TermFrequency;
}
/// <summary>
/// Indexes one document: strips punctuation, lower-cases and tokenizes its content, stems each
/// token, adds stems that are not yet in <paramref name="allTerms"/> to db.Terms, and then adds
/// one TermDocumentWeight (carrying the term frequency) for every non-stopword term that occurs
/// in the document.
/// </summary>
/// <param name="document">
/// The document to index. Its ID is copied onto every weight row; the callers in this file save
/// the document before calling this method.
/// </param>
/// <param name="totalNoOfDocumentsInCollection">Not used by this method; kept so existing callers compile.</param>
/// <param name="stopwords">Terms that must not receive a weight row.</param>
/// <param name="allTerms">
/// The known terms. Newly discovered terms are appended to this list as a side effect.
/// </param>
private void processDocument(Document document, int totalNoOfDocumentsInCollection, List<Term> stopwords, List<Term> allTerms)
{
    // A document without content has nothing to tokenize; treat null like an empty string
    // instead of failing on the enumeration below.
    string content = document.Content ?? string.Empty;

    // remove all punctuation (single pass instead of repeated string concatenation)
    string documentWithoutPunctuation = new string(content.Where(c => !char.IsPunctuation(c)).ToArray());

    // Split on every common whitespace character so that tabs and bare "\n" line endings
    // separate words just like spaces and "\r\n" do. Invariant lower-casing keeps the stored
    // terms independent of the server's culture.
    char[] separators = { ' ', '\t', '\r', '\n' };
    string[] rawTermsInDocument = documentWithoutPunctuation
        .ToLowerInvariant()
        .Split(separators, StringSplitOptions.RemoveEmptyEntries);

    // stems each term in the document, and registers it as a new term if it hasn't been
    // encountered before. Also collects all of the document's terms in stemmed form.
    EnglishStemmer stemmer = new EnglishStemmer();
    List<string> stemmedTermsInDocument = new List<string>(rawTermsInDocument.Length);

    // Set of stems already present in allTerms, so the "seen before?" check is a hash lookup
    // rather than a scan of the whole term list per token.
    HashSet<string> knownStems = new HashSet<string>(allTerms.Select(t => t.StemmedText));
    bool addedNewTerms = false;

    foreach (string rawTerm in rawTermsInDocument)
    {
        string stemmedTerm = stemmer.Stem(rawTerm);
        stemmedTermsInDocument.Add(stemmedTerm);

        if (knownStems.Contains(stemmedTerm))
        {
            continue;
        }

        Term termObj = new Term { StemmedText = stemmedTerm, Text = rawTerm };
        if (!stopwords.Contains(termObj))
        {
            db.Terms.Add(termObj);
            allTerms.Add(termObj);
            knownStems.Add(stemmedTerm);
            addedNewTerms = true;
        }
    }

    // Persist all new terms in one round trip. This has to happen before the weights are
    // built because term.ID is read below (presumably a store-generated key that is only
    // populated on save).
    if (addedNewTerms)
    {
        db.SaveChanges();
    }

    // Group the document's stems once so each term's frequency is a lookup instead of a
    // re-count of the whole document. A lookup yields an empty group for absent keys.
    ILookup<string, string> occurrencesByStem = stemmedTermsInDocument.ToLookup(stem => stem);

    /*goes through all known terms and, if the term is in the document and is not a stopword,
      creates a term document weight for it carrying its term frequency*/
    foreach (var term in allTerms)
    {
        int termFrequency = occurrencesByStem[term.StemmedText].Count();
        if (termFrequency > 0 && !stopwords.Contains(term))
        {
            TermDocumentWeight termDocumentWeight = new TermDocumentWeight();
            termDocumentWeight.DocumentID = document.ID;
            termDocumentWeight.TermID = term.ID;
            termDocumentWeight.TermFrequency = termFrequency;
            db.TermDocumentWeights.Add(termDocumentWeight);
            Debug.WriteLine("TDW for " + termDocumentWeight.Term + " added, ID: " + termDocumentWeight.ID);
        }
    }

    db.SaveChanges();
}
/// <summary>
/// Imports a collection of documents supplied as one block of text in
/// <paramref name="document"/>.Content. A line of the form ".I" followed by a number starts a
/// new document named after that number, a ".W" line is a marker that is skipped, and every
/// other line is body text of the current document. Each parsed document is saved and then
/// indexed with processDocument.
/// </summary>
/// <param name="document">Carrier for the raw collection text; it is not itself saved here.</param>
private void allFormatInput(Document document)
{
    var dotIRegex = new Regex(@"^\.I\s+(\d+)$");
    var dotWRegex = new Regex(@"^\.W$");

    //retrieve all the stopwords from db.terms and put them in a list
    List<Term> stopwords = new List<Term>();
    foreach (var sw in db.Stopwords.ToList())
    {
        stopwords.Add(db.Terms.Find(sw.TermID));
    }

    int totalNoOfDocuments = db.Documents.Count();
    Document doc = null;
    List<string> bodyLines = new List<string>();

    // Null content is treated as an empty collection rather than crashing the reader.
    using (StringReader reader = new StringReader(document.Content ?? string.Empty))
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // NOTE(review): matchValue is defined elsewhere; this code relies on it returning
            // null when the line does not match the given pattern - confirm.
            String iValue = matchValue(dotIRegex, line);

            if (iValue != null)
            {
                //the line is a .I line: finish the document collected so far (if any)
                //and start a new one
                if (doc != null)
                {
                    storeParsedDocument(doc, bodyLines, totalNoOfDocuments, stopwords);
                }
                doc = new Document { Name = iValue };
                bodyLines = new List<string>();
            }
            else if (doc != null && matchValue(dotWRegex, line) == null)
            {
                //an ordinary body line. Lines seen before the first .I line belong to no
                //document and are ignored; .W marker lines are never added.
                bodyLines.Add(line);
            }
        }
    }

    // Store the last document; when the input held no .I line at all there is nothing to store.
    if (doc != null)
    {
        storeParsedDocument(doc, bodyLines, totalNoOfDocuments, stopwords);
    }
}

/// <summary>
/// Sets the content of a parsed document from its collected body lines, saves it, and indexes it.
/// </summary>
private void storeParsedDocument(Document doc, List<string> bodyLines, int totalNoOfDocuments, List<Term> stopwords)
{
    // Re-insert the line breaks that ReadLine stripped so the last word of one line does not
    // fuse with the first word of the next; "\r\n" is a separator processDocument splits on.
    doc.Content = string.Join("\r\n", bodyLines.ToArray());

    db.Documents.Add(doc);
    db.SaveChanges();
    processDocument(doc, totalNoOfDocuments, stopwords, db.Terms.ToList());
}