Пример #1
0
        /// <summary>
        /// Prints to the console every term listed in "myterms.txt" (under
        /// <c>d.Helper.SOURCE_DIRECTORY_NAME</c>) that is not a key of the index's <c>TermIds</c> table.
        /// </summary>
        /// <remarks>
        /// The term of a line is the text before its first space. A line without any space is
        /// treated as a term in its entirety, and lines that yield an empty term are skipped.
        /// </remarks>
        public void GetNewTerms()
        {
            d.Index   index = new d.Index(d.Helper.INDEX_DIRECTORY_NAME);
            Hashtable terms = index.TermIds;

            string    path     = d.Helper.SOURCE_DIRECTORY_NAME + "myterms.txt";
            Hashtable newTerms = new Hashtable();

            // using blocks release the file handle even when reading throws part-way through.
            using (FileStream fs = new FileStream(path, FileMode.Open))
            using (StreamReader r = new StreamReader(fs, System.Text.Encoding.ASCII))
            {
                string line;
                while ((line = r.ReadLine()) != null)
                {
                    // IndexOf returns -1 when the line has no space; Substring(0, -1) would throw.
                    int    space = line.IndexOf(' ');
                    string term  = space < 0 ? line : line.Substring(0, space);
                    if (term.Length == 0)
                    {
                        continue;
                    }

                    if (!terms.Contains(term))
                    {
                        // Indexer instead of Add: a term repeated in the file must not throw.
                        newTerms[term] = true;
                    }
                }
            }

            Console.WriteLine("_________________new terms = " + newTerms.Count);
            foreach (object term in newTerms.Keys)
            {
                Console.WriteLine(term);
            }
        }
Пример #2
0
        /// <summary>
        /// Runs the query from <c>tbxQuery</c> through a <c>d.VSSearcher</c> and renders the top
        /// results as an HTML table into <c>ltrResults</c>.
        /// </summary>
        /// <remarks>
        /// <c>tbxPagerank</c> supplies the weight passed to the searcher and <c>tbxDisplay</c> the
        /// maximum number of rows. Titles and URLs read from the index are HTML-encoded before
        /// being written into the markup.
        /// </remarks>
        private void execSearch()
        {
            float w = Convert.ToSingle(tbxPagerank.Text);

            d.Index            index    = new d.Index(getIndexDir());
            d.VSSearcher       searcher = new d.VSSearcher(index, w);
            d.ResultDocument[] results  = searcher.Search(tbxQuery.Text.Trim());
            if (results != null && results.Length > 0)
            {
                // Clamp to zero so a negative request reads "top 0" instead of echoing the raw input.
                int max = Math.Max(0, Math.Min(results.Length, Convert.ToInt32(tbxDisplay.Text)));

                StringBuilder sb = new StringBuilder();
                // Report the number of rows actually rendered, not the number requested.
                sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. Displaying the top {1}:</p>", results.Length, max);
                sb.Append("<table style='font-family:Verdana;font-size:10pt;' cellpadding='3' cellspacing='2' width='100%'>");
                sb.Append("<tr bgcolor='#f1f1f1'><td>#</td><td>similarity</td><td>pagerank</td><td>document</td></tr>");
                for (int i = 0; i < max; i++)
                {
                    int    docId = results[i].DocId;
                    string url   = "http://" + index.GetURL(docId).Replace("%%", "/");

                    // Index content is not trusted markup: encode it for the context it lands in.
                    // href uses double quotes because HtmlAttributeEncode always escapes '"'.
                    string href    = System.Web.HttpUtility.HtmlAttributeEncode(url);
                    string urlText = System.Web.HttpUtility.HtmlEncode(url);
                    string title   = System.Web.HttpUtility.HtmlEncode(index.GetTitle(docId));

                    sb.AppendFormat("<tr><td>{4}</td><td>{0}</td><td>{1}</td><td><p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{2}\">{3}</a>",
                                    results[i].Similarity, results[i].PageRank, href, title, i + 1);
                    sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{1}</a></td></tr>", href, urlText);
                }
                sb.Append("</table>");
                ltrResults.Text = sb.ToString();
            }
            else
            {
                ltrResults.Text = "There were no results";
            }
        }
Пример #3
0
        /// <summary>
        /// Runs <c>i.IndexBuilder.BuildIndex</c> over the hard-coded source directory, then opens
        /// a <c>d.Index</c> on the hard-coded index directory and stores it in <c>index</c>.
        /// </summary>
        public void SetUp()
        {
            string sourceDir = @"C:\_current\development\data\isr\testindex\source\";
            string indexDir  = @"C:\_current\development\data\isr\testindex\index\";

            // The builder must finish before the index is opened on the same directory.
            new i.IndexBuilder(new i.DataLoader(sourceDir), indexDir).BuildIndex();

            index = new d.Index(indexDir);
        }
Пример #4
0
        /// <summary>
        /// Prints to the console every key of the index's <c>TermIds</c> table that does not
        /// appear in "myterms.txt" (under <c>d.Helper.SOURCE_DIRECTORY_NAME</c>).
        /// </summary>
        /// <remarks>
        /// The term of a line is the text before its first space. A line without any space is
        /// treated as a term in its entirety, and lines that yield an empty term are skipped.
        /// </remarks>
        public void GetMissedTerms()
        {
            string    path    = d.Helper.SOURCE_DIRECTORY_NAME + "myterms.txt";
            Hashtable myterms = new Hashtable();

            // using blocks release the file handle even when reading throws part-way through.
            using (FileStream fs = new FileStream(path, FileMode.Open))
            using (StreamReader r = new StreamReader(fs, System.Text.Encoding.ASCII))
            {
                string line;
                while ((line = r.ReadLine()) != null)
                {
                    // IndexOf returns -1 when the line has no space; Substring(0, -1) would throw.
                    int    space = line.IndexOf(' ');
                    string term  = space < 0 ? line : line.Substring(0, space);
                    if (term.Length == 0)
                    {
                        continue;
                    }

                    // Indexer instead of Add: a term repeated in the file must not throw.
                    myterms[term] = true;
                }
            }

            Console.WriteLine("_________________my terms = " + myterms.Count);

            d.Index   index = new d.Index(d.Helper.INDEX_DIRECTORY_NAME);
            Hashtable terms = index.TermIds;

            Hashtable missedterms = new Hashtable();
            foreach (object indexedTerm in terms.Keys)
            {
                if (!myterms.Contains(indexedTerm))
                {
                    missedterms[indexedTerm] = true;
                }
            }

            Console.WriteLine("_________________missed terms = " + missedterms.Count);
            foreach (object missed in missedterms.Keys)
            {
                Console.WriteLine(missed);
            }
        }
Пример #5
0
        /// <summary>
        /// Runs the query from <c>tbxQuery</c>, feeds the results to a <c>d.AHPageLoader</c> and
        /// renders its <c>Authorities</c> as an HTML table into <c>ltrResults</c>.
        /// </summary>
        /// <remarks>
        /// <c>tbxRoot</c>, <c>tbxParents</c> and <c>tbxChildren</c> are parsed as integers and
        /// passed to the page loader; <c>tbxDisplay</c> caps the number of rows. Titles and URLs
        /// read from the index are HTML-encoded before being written into the markup.
        /// </remarks>
        private void execAuthorities()
        {
            float w = Convert.ToSingle(tbxPagerank.Text);

            d.Index            index    = new d.Index(getIndexDir());
            d.VSSearcher       searcher = new d.VSSearcher(index, w);
            d.ResultDocument[] results  = searcher.Search(tbxQuery.Text.Trim());
            if (results != null && results.Length > 0)
            {
                StringBuilder sb = new StringBuilder();

                int            rootSize    = Convert.ToInt32(tbxRoot.Text);
                int            maxParents  = Convert.ToInt32(tbxParents.Text);
                int            maxChildren = Convert.ToInt32(tbxChildren.Text);
                d.AHPageLoader ahl         = new d.AHPageLoader(index, results, rootSize, maxParents, maxChildren);
                d.AHDocument[] authorities = ahl.Authorities;

                // Clamp to zero so a negative request reads "top 0" instead of a negative count.
                int max = Math.Max(0, Math.Min(authorities.Length, Convert.ToInt32(tbxDisplay.Text)));

                sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. " +
                                "Displaying the top {1} authorities:</p>", results.Length, max);
                sb.Append("<table style='font-family:Verdana;font-size:10pt;' cellpadding='3' cellspacing='2' width='100%'>");
                sb.Append("<tr bgcolor='#f1f1f1'><td>authority score</td><td>hub score</td><td>document</td></tr>");

                for (int i = 0; i < max; i++)
                {
                    int    docId = authorities[i].DocId;
                    string url   = "http://" + index.GetURL(docId).Replace("%%", "/");

                    // Index content is not trusted markup: encode it for the context it lands in.
                    // href uses double quotes because HtmlAttributeEncode always escapes '"'.
                    string href    = System.Web.HttpUtility.HtmlAttributeEncode(url);
                    string urlText = System.Web.HttpUtility.HtmlEncode(url);
                    string title   = System.Web.HttpUtility.HtmlEncode(index.GetTitle(docId));

                    sb.AppendFormat("<tr><td>{0}</td><td>{1}</td><td><p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{2}\">{3}</a>",
                                    authorities[i].AuthorityScore, authorities[i].HubScore, href, title);
                    sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{1}</a></td></tr>", href, urlText);
                }
                sb.Append("</table>");
                ltrResults.Text = sb.ToString();
            }
            else
            {
                ltrResults.Text = "There were no results";
            }
        }
Пример #6
0
        /// <summary>
        /// Runs the query from <c>tbxQuery</c>, clusters the top results and renders each
        /// cluster (its common terms and up to three of its documents) into <c>ltrResults</c>.
        /// </summary>
        /// <remarks>
        /// <c>tbxDocs</c> caps how many results are clustered and <c>tbxKmeans</c> supplies the
        /// cluster count <c>k</c>. Terms, titles and URLs read from the index are HTML-encoded
        /// before being written into the markup.
        /// </remarks>
        private void execCluster()
        {
            float w = Convert.ToSingle(tbxPagerank.Text);

            d.Index            index    = new d.Index(getIndexDir());
            d.VSSearcher       searcher = new d.VSSearcher(index, w);
            d.ResultDocument[] results  = searcher.Search(tbxQuery.Text.Trim());
            if (results != null && results.Length > 0)
            {
                int toCluster = Convert.ToInt32(tbxDocs.Text);
                toCluster = Math.Min(toCluster, results.Length);
                short[] docIds = new short[toCluster];
                for (int i = 0; i < toCluster; i++)
                {
                    docIds[i] = results[i].DocId;
                }

                int k = Convert.ToInt32(tbxKmeans.Text);

                // NOTE(review): the previous code had a separate radBuckshot.Checked branch that
                // built exactly the same BisectingClustering as the fallback, so the two are
                // merged here. If a dedicated buckshot implementation exists, wire it in.
                d.Cluster[] clusters;
                if (radKmeans.Checked)
                {
                    clusters = new d.KMeansClustering(index, k, true).GetClusters(docIds, 10);
                }
                else
                {
                    clusters = new d.BisectingClustering(index, k).GetClusters(docIds, 10);
                }

                StringBuilder sb = new StringBuilder();

                sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. " +
                                " Displaying the top {1} documents clustered into at most {2} clusters:</p>", results.Length, toCluster, k);

                for (int i = 0; i < clusters.Length; i++)
                {
                    sb.AppendFormat("<p style='margin-bottom:-10px;font-weight:bold;font-size:11pt;'>Cluster {0}", i + 1);
                    sb.Append("<p>Common terms: ");
                    IDictionaryEnumerator en = clusters[i].CommonTermIds.GetEnumerator();
                    while (en.MoveNext())
                    {
                        // Terms come from indexed documents, so they are encoded like any other index text.
                        sb.AppendFormat("{0} ", System.Web.HttpUtility.HtmlEncode(index.GetTerm(Convert.ToInt32(en.Key))));
                    }

                    sb.Append("<p>");

                    en = clusters[i].DocIds.GetEnumerator();
                    int count   = 0;
                    int topDocs = 3; // at most this many documents are listed per cluster
                    while (count < topDocs && en.MoveNext())
                    {
                        short  docId = Convert.ToInt16(en.Key);
                        string url   = "http://" + index.GetURL(docId).Replace("%%", "/");

                        // href uses double quotes because HtmlAttributeEncode always escapes '"'.
                        string href    = System.Web.HttpUtility.HtmlAttributeEncode(url);
                        string urlText = System.Web.HttpUtility.HtmlEncode(url);
                        string title   = System.Web.HttpUtility.HtmlEncode(index.GetTitle(docId));

                        sb.AppendFormat("<p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{0}\">{1}</a>", href, title);
                        sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{1}</a>", href, urlText);
                        count++;
                    }
                    sb.Append("<p style='border-bottom: solid 1px #999999;'>");
                }
                ltrResults.Text = sb.ToString();
            }
            else
            {
                ltrResults.Text = "There were no results";
            }
        }
Пример #7
0
        /// <summary>
        /// Converts "hashedLinks.txt" into the binary doc-links file (plus a ".txt" twin),
        /// replacing URLs with the doc IDs returned by <c>d.Index.GetDocId</c>.
        /// </summary>
        /// <remarks>
        /// Each input line is read as: 7 leading characters (skipped), the source URL, the
        /// marker "-->[", a comma-separated list of link URLs, and one trailing character
        /// (dropped). Lines that do not fit this shape, source URLs unknown to the index, and
        /// links whose target is unknown to the index are skipped; the latter are logged.
        /// Output pairs are written in <c>Array.Sort</c> order of <c>DocLink</c>.
        /// </remarks>
        public void convertLinksFile()
        {
            //step 1: load links into hashtable
            d.Index   index     = new d.Index(d.Helper.INDEX_DIRECTORY_NAME);
            Hashtable linksHash = new Hashtable();
            char[]    del       = { ',' };

            // using blocks release the input file even when parsing throws.
            using (FileStream fs = new FileStream(d.Helper.SOURCE_DIRECTORY_NAME + "hashedLinks.txt", FileMode.Open))
            using (StreamReader r = new StreamReader(fs, System.Text.Encoding.ASCII))
            {
                string line;
                while ((line = r.ReadLine()) != null)
                {
                    int endOfUrl      = line.IndexOf("-->[");
                    int startOfLinks  = endOfUrl + 4;
                    // Same value as the old "Length - 7 - url.Length - 4 - 1": everything after
                    // the marker except the final character of the line.
                    int lengthOfLinks = line.Length - startOfLinks - 1;

                    // Marker missing, marker inside the 7-character prefix, or nothing after
                    // the marker: Substring would throw, so skip the malformed line.
                    if (endOfUrl < 7 || lengthOfLinks < 0)
                    {
                        continue;
                    }

                    string   url      = line.Substring(7, endOfUrl - 7);
                    string   links    = line.Substring(startOfLinks, lengthOfLinks);
                    string[] linkUrls = links.Split(del);
                    short    docId    = index.GetDocId(url);
                    if (docId == -1)
                    {
                        continue;
                    }

                    for (int i = 0; i < linkUrls.Length; i++)
                    {
                        string link = linkUrls[i].Trim();
                        if (link.Length == 0)
                        {
                            continue;
                        }

                        short linkId = index.GetDocId(link);
                        if (linkId == -1)
                        {
                            // NOTE(review): previously logged but still stored, which wrote -1
                            // as a target id into the output. Skipping assumes -1 means
                            // "not in the index", as it does for docId above - confirm.
                            Console.WriteLine("linkId error: " + link);
                            continue;
                        }

                        // Indexer instead of Add so a repeated link cannot throw.
                        linksHash[new DocLink(docId, linkId)] = true;
                    }
                }
            }

            //step 2: load into array and sort them
            DocLink[] docLinkArray = new DocLink[linksHash.Count];
            linksHash.Keys.CopyTo(docLinkArray, 0);
            Array.Sort(docLinkArray);

            //step 3: write out to file
            string path     = d.Helper.INDEX_DIRECTORY_NAME + d.Helper.INDEX_DOCLINKS_FILE;
            string textpath = path + ".txt";

            int counter = 0;

            // FileMode.Create overwrites an existing file, replacing the old
            // Exists/Delete/CreateNew sequence.
            using (FileStream fs1 = new FileStream(path, FileMode.Create))
            using (FileStream fsText = new FileStream(textpath, FileMode.Create))
            using (BinaryWriter w = new BinaryWriter(fs1))
            using (StreamWriter wText = new StreamWriter(fsText))
            {
                for (int i = 0; i < docLinkArray.Length; i++)
                {
                    DocLink dl = docLinkArray[i];
                    if (dl == null)
                    {
                        // Previously logged and then dereferenced; skip the entry instead.
                        Console.WriteLine("null");
                        continue;
                    }
                    w.Write(dl.fromId);
                    w.Write(dl.toId);
                    wText.WriteLine(dl.fromId + " " + dl.toId);
                    counter++;
                }
            }

            Console.WriteLine("total links=" + counter);
        }
Пример #8
0
 /// <summary>
 /// Writes a "Loading index..." notice to the console, then opens a <c>d.Index</c> on
 /// <paramref name="path"/> and keeps it in <c>index</c>.
 /// </summary>
 /// <param name="path">Location handed to the <c>d.Index</c> constructor.</param>
 public ClusteringTest(string path)
 {
     Console.WriteLine("Loading index...");

     d.Index loaded = new d.Index(path);
     index = loaded;
 }
Пример #9
0
 /// <summary>
 /// Writes a "Loading index..." notice to the console, then opens a <c>d.Index</c> on
 /// <paramref name="indexPath"/> and keeps it in <c>index</c>.
 /// </summary>
 /// <param name="indexPath">Location handed to the <c>d.Index</c> constructor.</param>
 public SearchTest(string indexPath)
 {
     Console.WriteLine("Loading index...");

     d.Index loaded = new d.Index(indexPath);
     index = loaded;
 }