/// <summary>
/// Writes every row returned by <c>TermDocData.GetAllTermWeights()</c> to the
/// console, one row per line, with the first six column values separated by
/// single spaces and no trailing space.
/// </summary>
public void ConvertTermWeightTable()
{
    // Columns are read by position. In order they are treated as: term, doc id,
    // term weight, and the "_w", "_a" and "_wa" term-weight variants.
    const int fieldCount = 6;

    DataTable weights = new d.TermDocData().GetAllTermWeights();

    foreach (DataRow row in weights.Rows)
    {
        // Read all six values before printing anything for this row.
        string[] fields = new string[fieldCount];
        for (int col = 0; col < fieldCount; col++)
        {
            fields[col] = row[col].ToString();
        }

        Console.WriteLine(string.Join(" ", fields));
    }
}
/// <summary>
/// Creates a TermParser and assigns the helper objects it works with:
/// stop list, parse helper, delimiter set, stemmer, and the term / document /
/// term-document data accessors.
/// </summary>
public TermParser()
{
    stopList = new d.StopList();

    // parseHelper has to be assigned before GetDelims() is called on it.
    // Calling it first would dereference a field that this constructor has
    // not yet set.
    parseHelper = new ParseHelper();
    delims = parseHelper.GetDelims();

    stemmer = new PorterStemmer();
    termData = new d.TermData();
    DocData = new d.DocData();
    termdocData = new d.TermDocData();
}
/// <summary>
/// Writes every row returned by <c>TermDocData.GetAll()</c> to the console,
/// one row per line. The first thirteen column values are emitted in column
/// order, each followed by a single space (so every line ends with a space).
/// </summary>
public void ConvertTermDocTable()
{
    // Columns are read by position. In order they are treated as:
    //   0 term, 1 doc id, 2 text count, 3 bold count, 4 header count,
    //   5 anchor count, 6 title count, 7 url count, 8 external anchor count,
    //   9 total count, 10 total count "_w", 11 total count "_a",
    //   12 total count "_wa".
    const int columnCount = 13;

    DataTable dt = new d.TermDocData().GetAll();

    foreach (DataRow dr in dt.Rows)
    {
        StringBuilder sb = new StringBuilder();

        // Emit every column exactly once and in order, so that the anchor
        // count (column 5) appears between the header and title counts.
        for (int col = 0; col < columnCount; col++)
        {
            sb.Append(dr[col].ToString());
            sb.Append(' ');
        }

        Console.WriteLine(sb.ToString());
    }
}
/// <summary>
/// Walks every record returned by <c>TermDocData.GetRecords()</c>, combines
/// its per-location counts into two totals (with and without the external
/// anchor contribution) and stores them through <c>UpdateTermFreqs</c>.
/// Prints the row index to the console every tenth row.
/// </summary>
public void CalculateTermWeights()
{
    // Multiplier applied to the bold, heading, title, url and external-anchor counts.
    const int weight = 3;

    d.TermDocData store = new d.TermDocData();
    DataTable records = store.GetRecords();

    for (int rowIndex = 0; rowIndex < records.Rows.Count; rowIndex++)
    {
        // Progress output.
        if (rowIndex % 10 == 0)
        {
            Console.WriteLine(rowIndex);
        }

        DataRow row = records.Rows[rowIndex];

        string term = row[0].ToString();
        int docId = (int)row[1];
        int text = (int)row[2];
        int bold = weight * (int)row[3];
        int heading = weight * (int)row[4];
        // NOTE(review): anchor is the only markup count that is not multiplied
        // by the weight - confirm this is intentional.
        int anchor = (int)row[5];
        int title = weight * (int)row[6];
        int url = weight * (int)row[7];
        // NOTE(review): column 8 is skipped and column 9 is used here, whereas
        // ConvertTermDocTable treats column 8 of GetAll() as the external
        // anchor count - confirm the column layout of GetRecords().
        int externalAnchor = weight * (int)row[9];

        int tf = text + bold + heading + anchor + title + url;
        int tfa = tf + externalAnchor;

        store.UpdateTermFreqs(term, docId, tf, tfa);
    }
}
/// <summary>
/// For each document id returned by <c>DocData.GetIds()</c>: gathers the text
/// of the link records returned by <c>LinkData.GetRecordsByToId</c>, splits it
/// into terms, stems and filters them, counts the occurrences of each
/// surviving term, and stores every (term, document, count) triple through
/// <c>TermDocData.UpdateAnchorTextCount</c>. Prints the document index to the
/// console every tenth document.
/// </summary>
public void AddAnchorText()
{
    d.StopList stopWords = new d.StopList();
    ParseHelper helper = new ParseHelper();
    char[] separators = helper.GetDelims();
    PorterStemmer porter = new PorterStemmer();
    d.LinkData links = new d.LinkData();
    d.TermDocData termDocs = new d.TermDocData();

    DataTable docIds = new d.DocData().GetIds();

    for (int docIndex = 0; docIndex < docIds.Rows.Count; docIndex++)
    {
        // Progress output.
        if (docIndex % 10 == 0)
        {
            Console.WriteLine(docIndex);
        }

        int docId = (int)docIds.Rows[docIndex][0];

        // Concatenate the first column of every link record for this document,
        // each value followed by a space.
        StringBuilder linkText = new StringBuilder();
        DataTable linkRows = links.GetRecordsByToId(docId);
        foreach (DataRow link in linkRows.Rows)
        {
            linkText.Append(link[0].ToString()).Append(' ');
        }

        // Count occurrences of each accepted term.
        Hashtable counts = new Hashtable();
        foreach (string raw in linkText.ToString().Split(separators))
        {
            string term = porter.stemTerm(raw.ToLower().Trim());

            // Skip "home", empty terms, terms of 25+ characters, stop-list
            // terms, and terms the helper does not accept as ASCII letters.
            if (term == "home"
                || term.Length == 0
                || term.Length >= 25
                || stopWords.Contains(term)
                || !helper.IsAsciiLetters(term))
            {
                continue;
            }

            // The indexer yields null for a term that has not been seen yet.
            object previous = counts[term];
            counts[term] = previous == null ? 1 : (int)previous + 1;
        }

        // Store each term's count for this document.
        foreach (DictionaryEntry entry in counts)
        {
            string countedTerm = entry.Key.ToString();
            int occurrences = (int)entry.Value;
            termDocs.UpdateAnchorTextCount(countedTerm, docId, occurrences);
        }
    }
}