예제 #1
0
        /// <summary>
        /// Writes every row returned by <c>TermDocData.GetAllTermWeights()</c> to the
        /// console, one row per line, with the first six column values separated by
        /// single spaces.
        /// </summary>
        /// <remarks>
        /// The six columns are presumably: term, document id, term weight, and the
        /// "_w", "_a" and "_wa" weight variants (the table schema is not visible
        /// here - confirm against the data layer).
        /// </remarks>
        public void ConvertTermWeightTable()
        {
            const int columnCount = 6;
            DataTable weights = new d.TermDocData().GetAllTermWeights();

            foreach (DataRow row in weights.Rows)
            {
                // Read the columns left to right so each value is converted to
                // text in the same order it is printed.
                string[] fields = new string[columnCount];
                for (int col = 0; col < columnCount; col++)
                {
                    fields[col] = row[col].ToString();
                }

                // Space-separated, no trailing space after the last field.
                Console.WriteLine(string.Join(" ", fields));
            }
        }
예제 #2
0
파일: TermParser.cs 프로젝트: ic4f/oldcode
 /// <summary>
 /// Creates a parser and wires up its collaborators: the stop list, the
 /// parse helper and the delimiter set obtained from it, the stemmer, and
 /// the term / document / term-document data objects.
 /// </summary>
 public TermParser()
 {
     stopList    = new d.StopList();
     // The helper has to be created before it is asked for the delimiters;
     // calling GetDelims() first would dereference a not-yet-assigned field.
     parseHelper = new ParseHelper();
     delims      = parseHelper.GetDelims();
     stemmer     = new PorterStemmer();
     termData    = new d.TermData();
     DocData     = new d.DocData();
     termdocData = new d.TermDocData();
 }
예제 #3
0
        /// <summary>
        /// Writes every row returned by <c>TermDocData.GetAll()</c> to the console,
        /// one row per line. Columns 0 through 12 are printed in order, each value
        /// followed by a single space (so every line ends with a trailing space).
        /// </summary>
        public void ConvertTermDocTable()
        {
            DataTable dt = new d.TermDocData().GetAll();

            foreach (DataRow dr in dt.Rows)
            {
                string term                = dr[0].ToString();
                string docid               = dr[1].ToString();
                string textcount           = dr[2].ToString();
                string boldcount           = dr[3].ToString();
                string headercount         = dr[4].ToString();
                string anchorcount         = dr[5].ToString();
                string titlecount          = dr[6].ToString();
                string urlcount            = dr[7].ToString();
                string externalanchorcount = dr[8].ToString();
                string totalcount          = dr[9].ToString();
                string totalcount_w        = dr[10].ToString();
                string totalcount_a        = dr[11].ToString();
                string totalcount_wa       = dr[12].ToString();

                // Output order mirrors the column order. The anchor count goes
                // between the header and title counts; it must not be replaced
                // by a second copy of the title count.
                string[] fields =
                {
                    term,
                    docid,
                    textcount,
                    boldcount,
                    headercount,
                    anchorcount,
                    titlecount,
                    urlcount,
                    externalanchorcount,
                    totalcount,
                    totalcount_w,
                    totalcount_a,
                    totalcount_wa
                };

                StringBuilder sb = new StringBuilder();
                foreach (string field in fields)
                {
                    sb.AppendFormat("{0} ", field);
                }
                Console.WriteLine(sb.ToString());
            }
        }
예제 #4
0
        /// <summary>
        /// Walks every row returned by <c>TermDocData.GetRecords()</c>, combines the
        /// per-position counts into two term-frequency totals and stores them via
        /// <c>UpdateTermFreqs</c>. Counts for bold, heading, title, URL and external
        /// anchor occurrences are multiplied by 3; text and anchor counts are used
        /// as-is. The row index is printed every tenth row as a progress indicator.
        /// </summary>
        public void CalculateTermWeights()
        {
            const int boost = 3;

            d.TermDocData store   = new d.TermDocData();
            DataTable     records = store.GetRecords();

            for (int rowIndex = 0; rowIndex < records.Rows.Count; rowIndex++)
            {
                // Progress output: one line per ten processed rows.
                if (rowIndex % 10 == 0)
                {
                    Console.WriteLine(rowIndex);
                }

                DataRow row = records.Rows[rowIndex];

                string term    = row[0].ToString();
                int    docId   = (int)row[1];
                int    text    = (int)row[2];
                int    bold    = boost * (int)row[3];
                int    heading = boost * (int)row[4];
                int    anchor  = (int)row[5];
                int    title   = boost * (int)row[6];
                int    url     = boost * (int)row[7];
                // NOTE(review): column 8 is skipped and column 9 is read as the
                // external-anchor count - confirm against the GetRecords() schema.
                int externalAnchor = boost * (int)row[9];

                // tf covers on-page occurrences only; tfa additionally includes
                // the (boosted) external anchor count.
                int tf  = text + bold + heading + anchor + title + url;
                int tfa = tf + externalAnchor;

                store.UpdateTermFreqs(term, docId, tf, tfa);
            }
        }
예제 #5
0
        /// <summary>
        /// For every document id returned by <c>DocData.GetIds()</c>: gathers the
        /// text in column 0 of each row from <c>LinkData.GetRecordsByToId</c>,
        /// splits it on the parse helper's delimiters, normalises each piece
        /// (lower-case, trim, stem), counts the accepted terms and writes each
        /// term's count through <c>TermDocData.UpdateAnchorTextCount</c>.
        /// The document index is printed every tenth document as progress output.
        /// </summary>
        /// <remarks>
        /// A term is accepted when it is not "home", is 1 to 24 characters long,
        /// is not in the stop list and passes <c>ParseHelper.IsAsciiLetters</c>.
        /// </remarks>
        public void AddAnchorText()
        {
            d.StopList    stopWords  = new d.StopList();
            ParseHelper   helper     = new ParseHelper();
            char[]        separators = helper.GetDelims();
            PorterStemmer porter     = new PorterStemmer();
            d.LinkData    links      = new d.LinkData();
            d.TermDocData termDocs   = new d.TermDocData();
            DataTable     docIds     = new d.DocData().GetIds();

            for (int docIndex = 0; docIndex < docIds.Rows.Count; docIndex++)
            {
                // Progress output: one line per ten processed documents.
                if (docIndex % 10 == 0)
                {
                    Console.WriteLine(docIndex);
                }

                int docId = (int)docIds.Rows[docIndex][0];

                // Step 1: concatenate the link text of every link row for this
                // document, each piece followed by a space.
                StringBuilder anchorText = new StringBuilder();
                DataTable     linkRows   = links.GetRecordsByToId(docId);
                foreach (DataRow link in linkRows.Rows)
                {
                    anchorText.Append(link[0].ToString()).Append(' ');
                }

                // Step 2: tally accepted terms for this document.
                Hashtable counts = new Hashtable();
                foreach (string piece in anchorText.ToString().Split(separators))
                {
                    string term = porter.stemTerm(piece.ToLower().Trim());

                    // Cheap rejections first, in the same order as the checks
                    // that need the stop list and the parse helper.
                    if (term == "home" || term.Length == 0 || term.Length >= 25)
                    {
                        continue;
                    }
                    if (stopWords.Contains(term) || !helper.IsAsciiLetters(term))
                    {
                        continue;
                    }

                    if (counts.Contains(term))
                    {
                        counts[term] = (int)counts[term] + 1;
                    }
                    else
                    {
                        counts.Add(term, 1);
                    }
                }

                // Step 3: persist each term's count for this document.
                foreach (DictionaryEntry entry in counts)
                {
                    termDocs.UpdateAnchorTextCount(entry.Key.ToString(), docId, (int)entry.Value);
                }
            }
        }