Beispiel #1
0
        /// <summary>
        /// Tokenizes <paramref name="text"/>, stems every token, discards
        /// stop-list terms and stores the surviving terms with their occurrence
        /// counts in <c>queryTerms</c> and <c>queryWeights</c>.
        /// </summary>
        /// <remarks>
        /// Both outputs are comma-terminated lists (for example <c>"cat,dog,"</c>
        /// and <c>"2,1,"</c>) whose entries correspond position by position. The
        /// trailing comma is kept because code outside this method may rely on
        /// the format. The order of the entries is the enumeration order of the
        /// underlying <see cref="Hashtable"/> and is therefore unspecified.
        /// When no term survives, both outputs are empty strings.
        /// </remarks>
        /// <param name="text">Raw text to analyse; <c>null</c> is treated as empty.</param>
        private void loadText(string text)
        {
            d.StopList    stoplist = new d.StopList();
            PorterStemmer stemmer  = new PorterStemmer();
            ParseHelper   ph       = new ParseHelper();

            if (text == null)
            {
                text = string.Empty;
            }

            text = text.ToLower();
            char[]    delims = ph.GetDelims();
            string[]  tokens = text.Split(delims);
            Hashtable counts = new Hashtable();

            foreach (string token in tokens)
            {
                // Trim so whitespace left around a token by the split cannot
                // turn the same word into a distinct term.
                string term = stemmer.stemTerm(token.Trim());

                // Split() returns empty strings for adjacent, leading and
                // trailing delimiters (and for empty input); without this
                // check they would be counted as a term and emitted as an
                // empty entry in queryTerms.
                if (term == null || term.Length == 0)
                {
                    continue;
                }

                if (stoplist.Contains(term))
                {
                    continue;
                }

                object seen = counts[term];
                if (seen == null)
                {
                    counts.Add(term, 1);
                }
                else
                {
                    counts[term] = (int)seen + 1;
                }
            }

            StringBuilder termList   = new StringBuilder();
            StringBuilder weightList = new StringBuilder();

            // Read key and value from the same entry so both lists stay aligned
            // without a second hashtable lookup per term.
            foreach (DictionaryEntry entry in counts)
            {
                termList.AppendFormat("{0},", entry.Key);
                weightList.AppendFormat("{0},", entry.Value);
            }

            queryTerms   = termList.ToString();
            queryWeights = weightList.ToString();
        }
Beispiel #2
0
 /// <summary>
 /// Splits <paramref name="input"/> on <c>delims</c> and, for every token that
 /// passes filtering, increments the count of the matching
 /// <c>TermCollector</c> for <paramref name="termType"/>.
 /// </summary>
 /// <remarks>
 /// Each token is lower-cased, trimmed and stemmed before filtering. A
 /// collector is created and registered in <c>termCollectors</c> the first
 /// time a term is seen. <c>textTerms</c> and <c>term</c> are not declared in
 /// this method (presumably class members) and are overwritten on every call.
 /// </remarks>
 /// <param name="input">Text to tokenize.</param>
 /// <param name="termType">Value passed through unchanged to <c>TermCollector.IncrementCount</c>.</param>
 private void accumTerms(string input, int termType)
 {
     textTerms = input.Split(delims);
     foreach (string rawToken in textTerms)
     {
         term = stemmer.stemTerm(rawToken.ToLower().Trim());

         // only allow non-empty terms < 25 chars
         if (term.Length == 0 || term.Length >= 25)
         {
             continue;
         }

         if (stopList.Contains(term))
         {
             continue;
         }

         if (!parseHelper.IsAsciiLetters(term))
         {
             continue;
         }

         TermCollector collector;
         if (!termCollectors.Contains(term))
         {
             // first occurrence of this term: register a new collector
             collector = new TermCollector(term);
             termCollectors.Add(term, collector);
         }
         else
         {
             collector = (TermCollector)termCollectors[term];
         }

         collector.IncrementCount(termType);
     }
 }
Beispiel #3
0
        /// <summary>
        /// For every document id returned by <c>DocData.GetIds()</c>, collects the
        /// text of the links pointing to that document, counts the stemmed terms
        /// in it and passes each term count to
        /// <c>TermDocData.UpdateAnchorTextCount</c>.
        /// </summary>
        /// <remarks>
        /// A term is counted only if it is not "home", is 1 to 24 characters
        /// long, is not in the stop list and passes
        /// <c>ParseHelper.IsAsciiLetters</c>. The zero-based index of every tenth
        /// document is written to the console as a progress indicator.
        /// </remarks>
        public void AddAnchorText()
        {
            d.StopList  stopWords = new d.StopList();
            ParseHelper helper    = new ParseHelper();

            char[]        separators = helper.GetDelims();
            PorterStemmer porter     = new PorterStemmer();

            d.LinkData    linkData    = new d.LinkData();
            d.TermDocData termDocData = new d.TermDocData();

            DataTable docIds = new d.DocData().GetIds();

            int position = 0;
            foreach (DataRow docRow in docIds.Rows)
            {
                // progress output for every tenth document
                if (position % 10 == 0)
                {
                    Console.WriteLine(position);
                }
                position++;

                int docId = (int)docRow[0];

                // concatenate the first column of every link row for this doc
                StringBuilder anchorText = new StringBuilder();
                DataTable     links      = linkData.GetRecordsByToId(docId);
                foreach (DataRow linkRow in links.Rows)
                {
                    anchorText.AppendFormat("{0} ", linkRow[0].ToString());
                }

                // count occurrences of each accepted term
                Hashtable counts = new Hashtable();
                foreach (string rawToken in anchorText.ToString().Split(separators))
                {
                    string stem = porter.stemTerm(rawToken.ToLower().Trim());

                    if (stem == "home" || stem.Length == 0 || stem.Length >= 25)
                    {
                        continue;
                    }

                    if (stopWords.Contains(stem))
                    {
                        continue;
                    }

                    if (!helper.IsAsciiLetters(stem))
                    {
                        continue;
                    }

                    object previous = counts[stem];
                    if (previous == null)
                    {
                        counts.Add(stem, 1);
                    }
                    else
                    {
                        counts[stem] = (int)previous + 1;
                    }
                }

                // write terms and counts to database
                foreach (DictionaryEntry entry in counts)
                {
                    termDocData.UpdateAnchorTextCount(entry.Key.ToString(), docId, (int)entry.Value);
                }
            }
        }