/// <summary>
/// Creates a term extractor with an empty term-collector table and fresh
/// stop-list, stemmer and parse-helper instances.
/// </summary>
public TermExtractor()
{
    termCollectors = new Hashtable();
    stopList = new d.StopList();
    stemmer = new PorterStemmer();

    // The delimiter set is obtained from the parse helper, so the helper
    // has to exist before the delimiters are requested.
    ParseHelper helper = new ParseHelper();
    parseHelper = helper;
    delims = helper.GetDelims();
}
/// <summary>
/// Creates a term parser and initializes its stop list, parse helper,
/// delimiter set, stemmer and data-access objects.
/// </summary>
/// <remarks>
/// The parse helper is created before <c>GetDelims()</c> is called on it.
/// Previously the delimiters were requested first, which dereferences a
/// null <c>parseHelper</c> unless the field has an initializer elsewhere.
/// </remarks>
public TermParser()
{
    stopList = new d.StopList();

    // Must precede the GetDelims() call below.
    parseHelper = new ParseHelper();
    delims = parseHelper.GetDelims();

    stemmer = new PorterStemmer();
    termData = new d.TermData();
    DocData = new d.DocData();
    termdocData = new d.TermDocData();
}
/// <summary>
/// Splits <paramref name="text"/> into tokens, stems each token, drops stop
/// words and stores the distinct terms and their occurrence counts in
/// <c>queryTerms</c> and <c>queryWeights</c>.
/// </summary>
/// <remarks>
/// Both fields are written as comma-terminated lists (every entry is followed
/// by a comma, including the last one) in the same enumeration order, so the
/// n-th weight belongs to the n-th term. Empty tokens, produced by adjacent
/// delimiters, are skipped instead of being counted as a term.
/// </remarks>
/// <param name="text">Raw text to load; <c>null</c> is treated as empty.</param>
private void loadText(string text)
{
    if (text == null)
    {
        text = string.Empty;
    }

    d.StopList stoplist = new d.StopList();
    PorterStemmer stemmer = new PorterStemmer();
    ParseHelper ph = new ParseHelper();

    // ToLower() is kept as-is so the normalization stays the same as in the
    // other term-processing code of this project.
    string[] tokens = text.ToLower().Split(ph.GetDelims());

    // term -> number of occurrences
    Hashtable terms = new Hashtable();
    foreach (string token in tokens)
    {
        // Split() yields empty strings for consecutive delimiters.
        if (token.Length == 0)
        {
            continue;
        }

        string term = stemmer.stemTerm(token);
        if (term == null || term.Length == 0 || stoplist.Contains(term))
        {
            continue;
        }

        object count = terms[term];
        if (count == null)
        {
            terms.Add(term, 1);
        }
        else
        {
            terms[term] = Convert.ToInt32(count) + 1;
        }
    }

    // Emit terms and weights in one pass so the two lists stay aligned.
    StringBuilder sbt = new StringBuilder();
    StringBuilder sbw = new StringBuilder();
    foreach (DictionaryEntry entry in terms)
    {
        sbt.AppendFormat("{0},", entry.Key.ToString());
        sbw.AppendFormat("{0},", entry.Value.ToString());
    }

    queryTerms = sbt.ToString();
    queryWeights = sbw.ToString();
}
/// <summary>
/// For every document id returned by <c>DocData.GetIds()</c>, gathers the
/// text of the link records fetched with <c>LinkData.GetRecordsByToId</c>,
/// counts the accepted stemmed terms and writes each term count through
/// <c>TermDocData.UpdateAnchorTextCount</c>.
/// </summary>
/// <remarks>
/// A term is accepted when it is not "home", is 1 to 24 characters long, is
/// not in the stop list and passes <c>ParseHelper.IsAsciiLetters</c>.
/// The current row index is printed to the console every 10 documents.
/// </remarks>
public void AddAnchorText()
{
    d.StopList stopWords = new d.StopList();
    ParseHelper helper = new ParseHelper();
    char[] separators = helper.GetDelims();
    PorterStemmer porter = new PorterStemmer();
    d.LinkData linkData = new d.LinkData();
    d.TermDocData termDocData = new d.TermDocData();

    DataTable docIds = new d.DocData().GetIds();
    for (int rowIndex = 0; rowIndex < docIds.Rows.Count; rowIndex++)
    {
        // progress output
        if (rowIndex % 10 == 0)
        {
            Console.WriteLine(rowIndex);
        }

        // accumulate all link text for this doc into a StringBuilder
        StringBuilder linkText = new StringBuilder();
        int docId = (int)docIds.Rows[rowIndex][0];
        DataTable links = linkData.GetRecordsByToId(docId);
        foreach (DataRow link in links.Rows)
        {
            linkText.AppendFormat("{0} ", link[0].ToString());
        }

        // accumulate terms + counts into the counts hashtable
        Hashtable counts = new Hashtable();
        string[] tokens = linkText.ToString().Split(separators);
        foreach (string token in tokens)
        {
            string term = porter.stemTerm(token.ToLower().Trim());

            // Filters are checked in the same order as the original condition.
            if (term == "home")
            {
                continue;
            }
            if (term.Length <= 0 || term.Length >= 25)
            {
                continue;
            }
            if (stopWords.Contains(term))
            {
                continue;
            }
            if (!helper.IsAsciiLetters(term))
            {
                continue;
            }

            if (counts.Contains(term))
            {
                counts[term] = (int)counts[term] + 1;
            }
            else
            {
                counts.Add(term, 1);
            }
        }

        // write terms and counts to database
        foreach (DictionaryEntry entry in counts)
        {
            termDocData.UpdateAnchorTextCount(entry.Key.ToString(), docId, (int)entry.Value);
        }
    }
}