public void AddAnchorText() { d.StopList stopList = new d.StopList(); ParseHelper parseHelper = new ParseHelper(); char[] delims = parseHelper.GetDelims(); PorterStemmer stemmer = new PorterStemmer(); d.LinkData ld = new d.LinkData(); d.TermDocData tdd = new d.TermDocData(); DataTable linksTable; int docId; StringBuilder sb; string[] terms; string term; Hashtable currTerms; DataTable dt = new d.DocData().GetIds(); for (int i = 0; i < dt.Rows.Count; i++) { if (i % 10 == 0) { Console.WriteLine(i); } //accumulate all link text for this doc into StringBuilder sb = new StringBuilder(); docId = (int)dt.Rows[i][0]; linksTable = ld.GetRecordsByToId(docId); foreach (DataRow dr in linksTable.Rows) { sb.AppendFormat("{0} ", dr[0].ToString()); } //accum terms + counts into currTerms hashtable currTerms = new Hashtable(); terms = sb.ToString().Split(delims); for (int j = 0; j < terms.Length; j++) { term = stemmer.stemTerm(terms[j].ToLower().Trim()); if (term != "home" && term.Length > 0 && term.Length < 25 && !stopList.Contains(term) && parseHelper.IsAsciiLetters(term)) { if (!currTerms.Contains(term)) { currTerms.Add(term, 1); } else { currTerms[term] = (int)currTerms[term] + 1; } } } //write terms and counts to database IDictionaryEnumerator en = currTerms.GetEnumerator(); string currTerm; int currCount; while (en.MoveNext()) { currTerm = en.Key.ToString(); currCount = (int)currTerms[currTerm]; tdd.UpdateAnchorTextCount(currTerm, docId, currCount); } } }
private void accumTerms(string input, int termType) { textTerms = input.Split(delims); for (int i = 0; i < textTerms.Length; i++) { term = stemmer.stemTerm(textTerms[i].ToLower().Trim()); if (term.Length > 0 && term.Length < 25 && !stopList.Contains(term) && parseHelper.IsAsciiLetters(term)) //only allow terms < 25 chars { TermCollector tc; if (termCollectors.Contains(term)) { tc = (TermCollector)termCollectors[term]; } else { tc = new TermCollector(term); termCollectors.Add(term, tc); } tc.IncrementCount(termType); } } return; }