private void writeToDatabase()
{
    d.DocData dd = new d.DocData();
    for (int i = 0; i < ranks.Length; i++)
    {
        dd.UpdatePageRank(ids[i], ranks[i]);
    }
}
private void BindData()
{
    // Table 0 holds the current page of rows; table 1 holds the total record
    // count that the pager needs to size itself.
    DataSet ds = new d.DocData().GetRecordsP(makeQuery(), pager.PageSize, pager.CurrentPage, dgrPages.SortExpression);
    dgrPages.DataSource = ds.Tables[0];
    dgrPages.DataBind();
    pager.AdjustAfterBinding(Convert.ToInt32(ds.Tables[1].Rows[0][0]));
}
private void load()
{
    // GetLinkCounts returns one row per document: (id, inbound count, outbound count).
    // Pre-size an int[] for each document so the link ids can be filled in below.
    DataTable docs = new d.DocData().GetLinkCounts();
    inboundLinks = new Hashtable();
    outboundLinks = new Hashtable();
    int id;
    foreach (DataRow dr in docs.Rows)
    {
        id = Convert.ToInt32(dr[0]);
        inboundLinks.Add(id, new int[Convert.ToInt32(dr[1])]);
        outboundLinks.Add(id, new int[Convert.ToInt32(dr[2])]);
    }

    // Fill the inbound arrays. The rows come back sorted by to-id, so a single
    // cursor can walk each document's array; it resets whenever the to-id changes.
    d.LinkData linkData = new d.LinkData();
    DataTable dt = linkData.GetLinksSortByTo();
    int currId = -1;
    int cursor = 0;
    int toid;
    int fromid;
    int[] currLinks = null;
    foreach (DataRow dr in dt.Rows)
    {
        toid = Convert.ToInt32(dr[0]);
        fromid = Convert.ToInt32(dr[1]);
        if (currId < toid)
        {
            cursor = 0;
            currId = toid;
            currLinks = (int[])inboundLinks[toid];
        }
        currLinks[cursor++] = fromid;
    }

    // Fill the outbound arrays the same way, this time sorted by from-id.
    dt = linkData.GetLinksSortByFrom();
    currId = -1;
    cursor = 0;
    currLinks = null;
    foreach (DataRow dr in dt.Rows)
    {
        fromid = Convert.ToInt32(dr[0]);
        toid = Convert.ToInt32(dr[1]);
        if (currId < fromid)
        {
            cursor = 0;
            currId = fromid;
            currLinks = (int[])outboundLinks[fromid];
        }
        currLinks[cursor++] = toid;
    }
}
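// init() further down calls index.GetInboundLinks(docid), which is not shown in
// this section. A minimal sketch of that accessor over the Hashtable built in
// load() above, assuming a missing id should come back as an empty array
// rather than null:
public int[] GetInboundLinks(int docId)
{
    int[] links = (int[])inboundLinks[docId];
    return links == null ? new int[0] : links;
}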
public TermParser()
{
    // parseHelper must be constructed before GetDelims() is called on it;
    // the original ordering dereferenced a null field.
    parseHelper = new ParseHelper();
    stopList = new d.StopList();
    delims = parseHelper.GetDelims();
    stemmer = new PorterStemmer();
    termData = new d.TermData();
    DocData = new d.DocData();
    termdocData = new d.TermDocData();
}
public void ConvertDocTable()
{
    // Dump every doc row as a single space-delimited line. Column order:
    // id, url, title, inbound, outbound, maxterms, maxterms_w, maxterms_a,
    // maxterms_wa, norm, norm_w, norm_a, norm_wa, pagerank, termcount.
    // The title (column 2) is bracketed with [[ ]] because it may itself
    // contain spaces.
    DataTable dt = new d.DocData().GetAll();
    foreach (DataRow dr in dt.Rows)
    {
        StringBuilder sb = new StringBuilder();
        for (int col = 0; col < 15; col++)
        {
            string field = dr[col].ToString();
            if (col == 2)
            {
                field = "[[" + field + "]]";
            }
            sb.Append(field);
            if (col < 14)
            {
                sb.Append(' ');
            }
        }
        Console.WriteLine(sb.ToString());
    }
}
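// The [[ ]] markers exist so a consumer can recover the title even though the
// record is otherwise space-delimited. A minimal sketch of that reader; the
// helper name ParseDocLine is hypothetical, not part of this project:
static string[] ParseDocLine(string line)
{
    int open = line.IndexOf("[[");
    int close = line.IndexOf("]]", open + 2);
    string title = line.Substring(open + 2, close - open - 2);
    // Two leading fields (id, url), the title, then the numeric columns.
    string[] head = line.Substring(0, open).Trim().Split(' ');
    string[] tail = line.Substring(close + 2).Trim().Split(' ');
    string[] fields = new string[head.Length + 1 + tail.Length];
    head.CopyTo(fields, 0);
    fields[head.Length] = title;
    tail.CopyTo(fields, head.Length + 1);
    return fields;
}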
// Remove the extra 1,213 page files that have no matching row in the docs table.
public void RemoveExtraPageFiles()
{
    string path = Helper.PAGES_PATH;
    string[] files = Directory.GetFiles(path);
    d.DocData pd = new d.DocData();
    foreach (string file in files)
    {
        // Page files are named "<pageId>.html"; recover the id from the file
        // name rather than from the loop counter, since ids may be sparse.
        FileInfo fi = new FileInfo(file);
        int pageId = Convert.ToInt32(fi.Name.Substring(0, fi.Name.IndexOf(".")));
        if (!pd.PageIdExists(pageId))
        {
            File.Delete(file);
        }
    }
}
public void Extract(string path)
{
    d.DocData pd = new d.DocData();
    string[] files = Directory.GetFiles(path);
    for (int i = 0; i < files.Length; i++)
    {
        // Each file is named "<pageId>.html"; pull the title out of its HTML
        // and write it back to the matching doc row.
        string title = extractTitle(loadFile(files[i]));
        FileInfo fi = new FileInfo(files[i]);
        int pageId = Convert.ToInt32(fi.Name.Substring(0, fi.Name.IndexOf(".")));
        pd.UpdateTitle(pageId, title);
        if (i % 1000 == 0)
        {
            Console.WriteLine("processing file #" + i);
        }
    }
}
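// extractTitle is not shown in this section. A minimal sketch of what it
// likely does, assuming a case-insensitive <title> tag and an empty string
// when none is found (requires using System.Text.RegularExpressions):
private string extractTitle(string html)
{
    Match m = Regex.Match(html, @"<title[^>]*>(.*?)</title>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    return m.Success ? m.Groups[1].Value.Trim() : "";
}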
private void init()
{
    // Seed every document with rank 1/N.
    DataTable docIds = new d.DocData().GetIds();
    int doccount = docIds.Rows.Count;
    float initRank = 1f / doccount;
    LinkIndex index = new LinkIndex();
    inboundLinks = new Hashtable();
    ranks = new float[doccount];
    ids = new int[doccount];
    int i = 0;
    int docid;
    int[] inboundArray;
    foreach (DataRow dr in docIds.Rows)
    {
        docid = Convert.ToInt32(dr[0]);
        ranks[i] = initRank; // load initial pagerank value
        ids[i] = docid;      // load docid into position i
        i++;

        // Store this docid's inbound from-ids as a hashtable for O(1) membership tests.
        Hashtable currInbounds = new Hashtable();
        inboundLinks.Add(docid, currInbounds);
        inboundArray = index.GetInboundLinks(docid);
        foreach (int fromid in inboundArray)
        {
            currInbounds.Add(fromid, true);
        }
    }

    // Cache each document's outbound link count; the rank a page passes along
    // is divided by this number during iteration.
    outboundLinkCouns = new Hashtable();
    DataTable linkCounts = new d.DocData().GetLinkCounts();
    int outboundCount;
    foreach (DataRow dr in linkCounts.Rows)
    {
        docid = Convert.ToInt32(dr[0]);
        outboundCount = Convert.ToInt32(dr[2]); // column 2 is the outbound count
        outboundLinkCouns.Add(docid, outboundCount);
    }
}
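// init() and writeToDatabase() bracket the actual PageRank iteration, which is
// not shown in this section. A minimal sketch of the damped update these
// structures support; the method name iterate() and the 0.85 damping factor
// are assumptions, not taken from the project:
private void iterate()
{
    // Map docid -> array position so a from-id can be resolved to its current rank.
    Hashtable pos = new Hashtable();
    for (int i = 0; i < ids.Length; i++) { pos.Add(ids[i], i); }

    float damping = 0.85f;
    float[] newRanks = new float[ranks.Length];
    for (int i = 0; i < ids.Length; i++)
    {
        float sum = 0f;
        Hashtable currInbounds = (Hashtable)inboundLinks[ids[i]];
        foreach (int fromid in currInbounds.Keys)
        {
            // Each inbound page passes its rank along, split across its outbound links.
            sum += ranks[(int)pos[fromid]] / (int)outboundLinkCouns[fromid];
        }
        newRanks[i] = (1f - damping) / ranks.Length + damping * sum;
    }
    ranks = newRanks;
}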
public void Run()
{
    UrlHelper urlHelper = new UrlHelper();
    Regex regex = makeRegex();
    MatchCollection mc;
    Regex rSpace = new Regex(@"\s+"); // collapses any run of whitespace to a single space
    d.DocData pd = new d.DocData();
    d.LinkData ld = new d.LinkData();
    Uri baseUri;
    Uri childUri;
    FileInfo fi;
    d.Doc p;
    int pageId;
    StreamReader sr;
    string html;
    string linkToProcess;
    int linkId;
    string linkText;
    string path = Helper.DOCS_PATH;
    string[] files = Directory.GetFiles(path);
    for (int i = 545; i < files.Length; i++) // the first 545 files were already processed
    {
        Console.WriteLine("processing file #" + i);
        fi = new FileInfo(files[i]);
        pageId = Convert.ToInt32(fi.Name.Substring(0, fi.Name.IndexOf(".")));
        p = new d.Doc(pageId);
        baseUri = new Uri(p.Url);
        sr = new StreamReader(fi.OpenRead());
        html = sr.ReadToEnd();
        sr.Close(); // release the file handle before moving on
        mc = regex.Matches(html);
        Console.WriteLine("found " + mc.Count + " links");
        foreach (Match m in mc)
        {
            try
            {
                // Resolve the href (group 3) against the page's own URL, then
                // normalize it so it can be matched to a crawled page.
                childUri = new Uri(baseUri, urlHelper.MakeLink(m.Groups[3].ToString()));
                linkToProcess = urlHelper.NormalizeUrl(childUri.AbsoluteUri);
                linkText = m.Groups[4].ToString();
                linkId = pd.GetIdByUrl(linkToProcess);
                if (linkId > 0 && linkText != "") // the link target is a crawled page
                {
                    // One regex pass replaces the original chain of nine
                    // double-space-to-single-space replacements.
                    linkText = rSpace.Replace(linkText, " ").Trim();
                    ld.UpdateText(pageId, linkId, linkText);
                }
            }
            catch (Exception) {}
        }
    }
}
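// makeRegex() is not shown in this section; from the usage above, group 3 must
// capture the href value and group 4 the anchor text. A minimal sketch under
// that assumption, not the project's actual pattern:
private Regex makeRegex()
{
    // group 1: attributes before href, group 2: optional quote character,
    // group 3: the URL, group 4: the link's inner text.
    return new Regex("<a\\s+([^>]*?)href\\s*=\\s*([\"'])?([^\"'>\\s]+)\\2?[^>]*>(.*?)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
}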
public void AddAnchorText()
{
    d.StopList stopList = new d.StopList();
    ParseHelper parseHelper = new ParseHelper();
    char[] delims = parseHelper.GetDelims();
    PorterStemmer stemmer = new PorterStemmer();
    d.LinkData ld = new d.LinkData();
    d.TermDocData tdd = new d.TermDocData();
    DataTable linksTable;
    int docId;
    StringBuilder sb;
    string[] terms;
    string term;
    Hashtable currTerms;
    DataTable dt = new d.DocData().GetIds();
    for (int i = 0; i < dt.Rows.Count; i++)
    {
        if (i % 10 == 0)
        {
            Console.WriteLine(i);
        }

        // Accumulate all anchor text pointing at this doc into one StringBuilder.
        sb = new StringBuilder();
        docId = (int)dt.Rows[i][0];
        linksTable = ld.GetRecordsByToId(docId);
        foreach (DataRow dr in linksTable.Rows)
        {
            sb.AppendFormat("{0} ", dr[0].ToString());
        }

        // Accumulate terms and counts into the currTerms hashtable.
        currTerms = new Hashtable();
        terms = sb.ToString().Split(delims);
        for (int j = 0; j < terms.Length; j++)
        {
            term = stemmer.stemTerm(terms[j].ToLower().Trim());
            if (term != "home" && term.Length > 0 && term.Length < 25 &&
                !stopList.Contains(term) && parseHelper.IsAsciiLetters(term))
            {
                if (!currTerms.Contains(term))
                {
                    currTerms.Add(term, 1);
                }
                else
                {
                    currTerms[term] = (int)currTerms[term] + 1;
                }
            }
        }

        // Write the terms and counts to the database.
        IDictionaryEnumerator en = currTerms.GetEnumerator();
        string currTerm;
        int currCount;
        while (en.MoveNext())
        {
            currTerm = en.Key.ToString();
            currCount = (int)currTerms[currTerm];
            tdd.UpdateAnchorTextCount(currTerm, docId, currCount);
        }
    }
}
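// ParseHelper.IsAsciiLetters is referenced above but not shown. A sketch of
// the filter it presumably applies; since terms are lowercased before the
// call, an a..z check is assumed to suffice:
public bool IsAsciiLetters(string term)
{
    foreach (char c in term)
    {
        if (c < 'a' || c > 'z') { return false; }
    }
    return term.Length > 0;
}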
private void execAH()
{
    pnlSearch.Visible = false;
    pnlAH.Visible = true;
    id.QueryVector qv = new id.QueryVector(filterInput(tbxSearch.Text));

    // PageRank weight; fall back to 1.0 if the textbox doesn't parse.
    float w = 1.0f;
    try
    {
        w = Convert.ToSingle(tbxPageRank.Text);
    }
    catch (Exception) {}

    DataSet ds = new d.SearchData().GetSearchResults(qv.QueryTerms, qv.QueryWeights, w);
    DataTable dt = ds.Tables[0];
    int resultcount = Convert.ToInt32(ds.Tables[1].Rows[0][0]);
    if (resultcount > 0)
    {
        int rootSize = 50;     // Convert.ToInt32(tbxRoot.Text);
        int maxParents = 20;   // Convert.ToInt32(tbxParents.Text);
        int maxChildren = 20;  // Convert.ToInt32(tbxChildren.Text);
        int displayResults = 50;

        // Use the search results as the root set for the HITS computation.
        int[] resultIds = new int[resultcount];
        int j = 0;
        foreach (DataRow dr in dt.Rows)
        {
            resultIds[j++] = Convert.ToInt32(dr[0]);
        }
        id.AHPageLoader ahl = new id.AHPageLoader(resultIds, rootSize, maxParents, maxChildren);
        id.AHDocument[] authorities = ahl.Authorities;
        int max = Math.Min(authorities.Length, displayResults);

        // Render the top authorities as an HTML table.
        StringBuilder sb = new StringBuilder();
        sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. " +
            "Displaying the top {1} authorities:</p>", resultcount, max);
        sb.Append("<table style='font-family:Verdana;font-size:10pt;' cellpadding='3' cellspacing='2' width='100%'>");
        sb.Append("<tr bgcolor='#f1f1f1'><td>authority score</td><td>hub score</td><td>document</td></tr>");
        d.DocData dd = new d.DocData();
        DataTable docData;
        for (int i = 0; i < max; i++)
        {
            int docId = authorities[i].DocId;
            docData = dd.GetDocData(docId);
            string url = docData.Rows[0][0].ToString();
            sb.AppendFormat("<tr><td>{0}</td><td>{1}</td><td><p style='margin-bottom:-10px;'><a style='font-size:11pt;' href='{2}'>{3}</a>",
                authorities[i].AuthorityScore, authorities[i].HubScore, url, docData.Rows[0][1].ToString());
            sb.AppendFormat("<p><a style='color:green;font-size:9pt' href='{0}'>{0}</a></td></tr>", url);
        }
        sb.Append("</table>");
        ltrAHResult.Text = sb.ToString();
    }
    else
    {
        ltrAHResult.Text = "There were no results";
    }
}
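// AHPageLoader computes the authority and hub scores behind the scenes; its
// internals are not shown here. A standalone sketch of the classic HITS update
// it presumably performs; the (from, to) index-pair adjacency representation
// and the method names are assumptions:
static void RunHits(int[][] links, double[] auth, double[] hub, int iterations)
{
    for (int iter = 0; iter < iterations; iter++)
    {
        // Authority score: sum of hub scores of the pages linking in.
        Array.Clear(auth, 0, auth.Length);
        foreach (int[] link in links) { auth[link[1]] += hub[link[0]]; }
        Normalize(auth);

        // Hub score: sum of authority scores of the pages linked to.
        Array.Clear(hub, 0, hub.Length);
        foreach (int[] link in links) { hub[link[0]] += auth[link[1]]; }
        Normalize(hub);
    }
}

static void Normalize(double[] v)
{
    double sum = 0;
    foreach (double x in v) { sum += x * x; }
    double norm = Math.Sqrt(sum);
    if (norm > 0) { for (int i = 0; i < v.Length; i++) { v[i] /= norm; } }
}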