Exemple #1
0
 /// <summary>
 /// Sends every value in <c>ranks</c> to <c>DocData.UpdatePageRank</c>, paired with the
 /// entry of <c>ids</c> stored at the same index.
 /// </summary>
 private void writeToDatabase()
 {
     d.DocData store = new d.DocData();

     int slot = 0;
     while (slot < ranks.Length)
     {
         // ids and ranks are parallel arrays: position "slot" describes one document
         store.UpdatePageRank(ids[slot], ranks[slot]);
         slot++;
     }
 }
Exemple #2
0
        /// <summary>
        /// Loads the records for the pager's current page, binds them to <c>dgrPages</c>
        /// and then lets the pager adjust itself.
        /// </summary>
        private void BindData()
        {
            d.DocData source = new d.DocData();
            DataSet   result = source.GetRecordsP(makeQuery(), pager.PageSize, pager.CurrentPage, dgrPages.SortExpression);

            // Table 0 holds the rows to display
            DataTable pageRows = result.Tables[0];
            dgrPages.DataSource = pageRows;
            dgrPages.DataBind();

            // Table 1 carries a single number in its first cell
            // (presumably the total record count - confirm against GetRecordsP)
            object firstCell = result.Tables[1].Rows[0][0];
            pager.AdjustAfterBinding(Convert.ToInt32(firstCell));
        }
Exemple #3
0
        /// <summary>
        /// Builds the in-memory link tables. For every document id returned by
        /// <c>DocData.GetLinkCounts()</c> an <c>int[]</c> is stored in <c>inboundLinks</c>
        /// (sized by column 1) and in <c>outboundLinks</c> (sized by column 2); the arrays
        /// are then filled from the two link listings of <c>LinkData</c>.
        /// </summary>
        private void load()
        {
            DataTable docs = new d.DocData().GetLinkCounts();

            inboundLinks  = new Hashtable();
            outboundLinks = new Hashtable();

            // Pre-size one array per document: column 0 = id, column 1 / 2 = array lengths
            foreach (DataRow dr in docs.Rows)
            {
                int id = Convert.ToInt32(dr[0]);
                inboundLinks.Add(id, new int[Convert.ToInt32(dr[1])]);
                outboundLinks.Add(id, new int[Convert.ToInt32(dr[2])]);
            }

            d.LinkData linkData = new d.LinkData();

            // Both listings have the same layout (owner id in column 0, other end in column 1),
            // so a single helper fills both tables.
            fillLinkArrays(linkData.GetLinksSortByTo(), inboundLinks);
            fillLinkArrays(linkData.GetLinksSortByFrom(), outboundLinks);
        }

        /// <summary>
        /// Copies column 1 of each row into the pre-sized array that <paramref name="target"/>
        /// holds for the id found in column 0.
        /// </summary>
        /// <param name="links">Link rows; column 0 is the owning id, column 1 the linked id.
        /// The "currId &lt; ownerId" test relies on column 0 arriving in ascending order
        /// (the GetLinksSortBy* names suggest this - confirm against LinkData).</param>
        /// <param name="target">Table mapping an id to its pre-sized <c>int[]</c>.</param>
        private static void fillLinkArrays(DataTable links, Hashtable target)
        {
            int   currId    = -1;
            int   cursor    = 0;
            int[] currLinks = null;

            foreach (DataRow dr in links.Rows)
            {
                int ownerId = Convert.ToInt32(dr[0]);
                int otherId = Convert.ToInt32(dr[1]);

                // A larger owner id starts a new group: switch to that id's array
                if (currId < ownerId)
                {
                    cursor    = 0;
                    currId    = ownerId;
                    currLinks = (int[])target[ownerId];
                }
                currLinks[cursor++] = otherId;
            }
        }
Exemple #4
0
 /// <summary>
 /// Creates a parser and initialises its stop list, parse helper, delimiter set,
 /// stemmer and data-access objects.
 /// </summary>
 public TermParser()
 {
     stopList    = new d.StopList();

     // parseHelper has to be created before GetDelims() is called on it;
     // the previous order dereferenced the field before it was assigned.
     parseHelper = new ParseHelper();
     delims      = parseHelper.GetDelims();

     stemmer     = new PorterStemmer();
     termData    = new d.TermData();
     DocData     = new d.DocData();
     termdocData = new d.TermDocData();
 }
Exemple #5
0
        /// <summary>
        /// Writes every row of <c>DocData.GetAll()</c> to the console, one line per row.
        /// The first 15 columns are printed in order, separated by single spaces, with
        /// column 2 wrapped in <c>[[ ]]</c> and no trailing space after the last column.
        /// </summary>
        public void ConvertDocTable()
        {
            // Column order as consumed here: id, url, title, inbound, outbound,
            // maxterms, maxterms_w, maxterms_a, maxterms_wa, norm, norm_w, norm_a,
            // norm_wa, pagerank, termcount.
            const int columnCount = 15;
            const int titleColumn = 2;

            DataTable allDocs = new d.DocData().GetAll();

            foreach (DataRow row in allDocs.Rows)
            {
                StringBuilder line = new StringBuilder();

                for (int col = 0; col < columnCount; col++)
                {
                    string cell = row[col].ToString();

                    if (col == titleColumn)
                    {
                        // The title is bracketed so it stays recognisable in the output
                        line.Append("[[").Append(cell).Append("]]");
                    }
                    else
                    {
                        line.Append(cell);
                    }

                    // Single space between columns, none after the last one
                    if (col < columnCount - 1)
                    {
                        line.Append(" ");
                    }
                }

                Console.WriteLine(line.ToString());
            }
        }
Exemple #6
0
        // Remove the extra page files (1.213 of them when this was written): every
        // "<id>.html" file in Helper.PAGES_PATH whose id is not known to DocData is deleted.
        //
        // The page id is taken from the file name rather than from the loop position, so
        // files are checked correctly even when ids are not the contiguous range 0..count-1.
        public void RemoveExtraPageFiles()
        {
            string path = Helper.PAGES_PATH;

            string[]  files = Directory.GetFiles(path, "*.html");
            d.DocData pd    = new d.DocData();
            foreach (string file in files)
            {
                // Only files named "<non-negative integer>.html" are candidates;
                // anything else is left untouched.
                int pageId;
                if (!int.TryParse(Path.GetFileNameWithoutExtension(file), out pageId) || pageId < 0)
                {
                    continue;
                }

                if (!pd.PageIdExists(pageId))
                {
                    // Delete the file that was actually found instead of rebuilding its path
                    File.Delete(file);
                }
            }
        }
Exemple #7
0
        /// <summary>
        /// For every file in <paramref name="path"/>, extracts a title from the file's
        /// content and stores it through <c>DocData.UpdateTitle</c> under the page id
        /// taken from the file name (the text before the first ".").
        /// </summary>
        /// <param name="path">Directory whose files are processed.</param>
        public void Extract(string path)
        {
            d.DocData docStore = new d.DocData();

            int position = 0;
            foreach (string file in Directory.GetFiles(path))
            {
                string title = extractTitle(loadFile(file));

                // File names start with the numeric page id, e.g. "123.html" -> 123
                string fileName = new FileInfo(file).Name;
                string idText   = fileName.Substring(0, fileName.IndexOf("."));
                docStore.UpdateTitle(Convert.ToInt32(idText), title);

                // Progress output once per thousand files
                if (position % 1000 == 0)
                {
                    Console.WriteLine("processing file #" + position);
                }

                position++;
            }
        }
Exemple #8
0
        /// <summary>
        /// Prepares the state for the rank computation: fills <c>ids</c> and <c>ranks</c>
        /// (every rank starts at 1 / document count), builds <c>inboundLinks</c> as a table of
        /// per-document inbound-id sets, and loads <c>outboundLinkCouns</c> from column 2 of
        /// <c>DocData.GetLinkCounts()</c>.
        /// </summary>
        private void init()
        {
            DataTable idTable   = new d.DocData().GetIds();
            int       total     = idTable.Rows.Count;
            float     startRank = 1f / total;
            LinkIndex linkIndex = new LinkIndex();

            inboundLinks = new Hashtable();
            ranks        = new float[total];
            ids          = new int[total];

            for (int pos = 0; pos < total; pos++)
            {
                int docid = Convert.ToInt32(idTable.Rows[pos][0]);

                // Position "pos" of both arrays describes this document
                ranks[pos] = startRank;
                ids[pos]   = docid;

                // Register the (still empty) set first, then fill it with the inbound ids
                Hashtable senders = new Hashtable();
                inboundLinks.Add(docid, senders);
                foreach (int fromid in linkIndex.GetInboundLinks(docid))
                {
                    senders.Add(fromid, true);
                }
            }

            outboundLinkCouns = new Hashtable();
            DataTable countTable = new d.DocData().GetLinkCounts();

            // Column 0 = document id, column 2 = the count stored for it
            foreach (DataRow countRow in countTable.Rows)
            {
                outboundLinkCouns.Add(Convert.ToInt32(countRow[0]), Convert.ToInt32(countRow[2]));
            }
        }
Exemple #9
0
        /// <summary>
        /// Walks the files in <c>Helper.DOCS_PATH</c>, finds the links in each file with the
        /// regex from <c>makeRegex()</c>, and for every link that resolves to a known document
        /// and has non-empty text stores the whitespace-normalised link text through
        /// <c>LinkData.UpdateText</c>.
        /// </summary>
        /// <remarks>
        /// A link that cannot be processed (for example a malformed URL) is reported on the
        /// console and skipped; it does not stop the run.
        /// </remarks>
        public void Run()
        {
            // Files before this index were handled by an earlier run
            const int firstFileIndex = 545;

            UrlHelper urlHelper = new UrlHelper();
            Regex     regex     = makeRegex();

            // Collapses any run of whitespace (spaces, tabs, line breaks) into one space.
            // This replaces a fixed chain of string replacements that handled runs of
            // spaces only approximately.
            Regex rSpace = new Regex(@"\s+");

            d.DocData  pd = new d.DocData();
            d.LinkData ld = new d.LinkData();

            string   path  = Helper.DOCS_PATH;
            string[] files = Directory.GetFiles(path);

            for (int i = firstFileIndex; i < files.Length; i++)
            {
                Console.WriteLine("processing file #" + i);

                // The file name starts with the numeric page id, e.g. "123.html" -> 123
                FileInfo fi      = new FileInfo(files[i]);
                int      pageId  = Convert.ToInt32(fi.Name.Substring(0, fi.Name.IndexOf(".")));
                d.Doc    p       = new d.Doc(pageId);
                Uri      baseUri = new Uri(p.Url);

                // Dispose the reader (and its file handle) as soon as the content is read
                string html;
                using (StreamReader sr = new StreamReader(fi.OpenRead()))
                {
                    html = sr.ReadToEnd();
                }

                MatchCollection mc = regex.Matches(html);

                Console.WriteLine("found " + mc.Count + " links");

                foreach (Match m in mc)
                {
                    try
                    {
                        // Group 3 is the link target, group 4 the link text
                        Uri    childUri      = new Uri(baseUri, urlHelper.MakeLink(m.Groups[3].ToString()));
                        string linkToProcess = urlHelper.NormalizeUrl(childUri.AbsoluteUri);
                        string linkText      = m.Groups[4].ToString();

                        int linkId = pd.GetIdByUrl(linkToProcess);
                        if (linkId > 0 && linkText != "")                         //found page!
                        {
                            linkText = rSpace.Replace(linkText, " ").Trim();
                            ld.UpdateText(pageId, linkId, linkText);
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: one bad link must not abort the file, but leave a trace
                        Console.WriteLine("skipping link in file #" + i + ": " + ex.Message);
                    }
                }
            }
        }
Exemple #10
0
        /// <summary>
        /// For every document id from <c>DocData.GetIds()</c>: gathers the text of all links
        /// returned by <c>LinkData.GetRecordsByToId</c>, splits it into terms, stems and
        /// filters them, counts the occurrences per term and writes each count through
        /// <c>TermDocData.UpdateAnchorTextCount</c>.
        /// </summary>
        public void AddAnchorText()
        {
            d.StopList  stopWords = new d.StopList();
            ParseHelper helper    = new ParseHelper();

            char[]        separators = helper.GetDelims();
            PorterStemmer porter     = new PorterStemmer();

            d.LinkData    linkStore    = new d.LinkData();
            d.TermDocData termDocStore = new d.TermDocData();

            DataTable idTable = new d.DocData().GetIds();

            int rowIndex = 0;
            foreach (DataRow idRow in idTable.Rows)
            {
                // Progress output every tenth document
                if (rowIndex % 10 == 0)
                {
                    Console.WriteLine(rowIndex);
                }

                int docId = (int)idRow[0];

                // Concatenate the text of every link pointing at this document
                StringBuilder anchorText = new StringBuilder();
                foreach (DataRow linkRow in linkStore.GetRecordsByToId(docId).Rows)
                {
                    anchorText.Append(linkRow[0].ToString()).Append(" ");
                }

                // Count the accepted terms
                Hashtable counts = new Hashtable();
                foreach (string raw in anchorText.ToString().Split(separators))
                {
                    string term = porter.stemTerm(raw.ToLower().Trim());

                    // Accepted: not "home", 1-24 characters, not a stop word, ASCII letters only
                    bool accepted = term != "home"
                                    && term.Length > 0
                                    && term.Length < 25
                                    && !stopWords.Contains(term)
                                    && helper.IsAsciiLetters(term);
                    if (!accepted)
                    {
                        continue;
                    }

                    if (counts.Contains(term))
                    {
                        counts[term] = (int)counts[term] + 1;
                    }
                    else
                    {
                        counts.Add(term, 1);
                    }
                }

                // Persist one count per distinct term
                foreach (DictionaryEntry entry in counts)
                {
                    termDocStore.UpdateAnchorTextCount(entry.Key.ToString(), docId, (int)entry.Value);
                }

                rowIndex++;
            }
        }
Exemple #11
0
        /// <summary>
        /// Runs the search for the text in <c>tbxSearch</c>, feeds the result ids to
        /// <c>AHPageLoader</c> and renders the top authorities (authority score, hub score,
        /// title and URL) as an HTML table into <c>ltrAHResult</c>.
        /// </summary>
        /// <remarks>
        /// Titles and URLs come from stored documents and are HTML-encoded before being
        /// written into the markup.
        /// </remarks>
        private void execAH()
        {
            pnlSearch.Visible = false;
            pnlAH.Visible     = true;

            id.QueryVector qv = new id.QueryVector(filterInput(tbxSearch.Text));

            // Weight taken from the text box; anything unparsable falls back to 1.0
            float w;
            if (!float.TryParse(tbxPageRank.Text, out w))
            {
                w = 1.0f;
            }

            DataSet   ds          = new d.SearchData().GetSearchResults(qv.QueryTerms, qv.QueryWeights, w);
            DataTable dt          = ds.Tables[0];
            int       resultcount = Convert.ToInt32(ds.Tables[1].Rows[0][0]);

            if (resultcount <= 0)
            {
                ltrAHResult.Text = "There were no results";
                return;
            }

            int rootSize       = 50; //Convert.ToInt32(tbxRoot.Text);
            int maxParents     = 20; //Convert.ToInt32(tbxParents.Text);
            int maxChildren    = 20; //Convert.ToInt32(tbxChildren.Text);
            int displayResults = 50;

            // Size the id array from the rows actually returned: sizing it from the reported
            // count overflowed when there were more rows and left trailing zero ids when
            // there were fewer.
            int[] resultIds = new int[dt.Rows.Count];
            for (int r = 0; r < resultIds.Length; r++)
            {
                resultIds[r] = Convert.ToInt32(dt.Rows[r][0]);
            }

            id.AHPageLoader ahl         = new id.AHPageLoader(resultIds, rootSize, maxParents, maxChildren);
            id.AHDocument[] authorities = ahl.Authorities;

            int max = Math.Min(authorities.Length, displayResults);

            StringBuilder sb = new StringBuilder();
            sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. " +
                            "Displaying the top {1} authorities:</p>", resultcount, max);
            sb.Append("<table style='font-family:Verdana;font-size:10pt;' cellpadding='3' cellspacing='2' width='100%'>");
            sb.Append("<tr bgcolor='#f1f1f1'><td>authority score</td><td>hub score</td><td>document</td></tr>");

            d.DocData dd = new d.DocData();
            for (int i = 0; i < max; i++)
            {
                int docId = authorities[i].DocId;

                // Row 0 of GetDocData: column 0 = URL, column 1 = title
                DataTable docData = dd.GetDocData(docId);
                string    url     = docData.Rows[0][0].ToString();
                string    title   = docData.Rows[0][1].ToString();

                // Encode for the two contexts the values are written into. The href attributes
                // use double quotes so that attribute encoding (which always escapes '"')
                // is sufficient on every framework version.
                string urlAttr   = System.Web.HttpUtility.HtmlAttributeEncode(url);
                string urlText   = System.Web.HttpUtility.HtmlEncode(url);
                string titleText = System.Web.HttpUtility.HtmlEncode(title);

                sb.AppendFormat("<tr><td>{0}</td><td>{1}</td><td><p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{2}\">{3}</a>",
                                authorities[i].AuthorityScore, authorities[i].HubScore, urlAttr, titleText);
                sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{1}</a></td></tr>", urlAttr, urlText);
            }
            sb.Append("</table>");
            ltrAHResult.Text = sb.ToString();
        }