/// <summary>
/// Prints to the console every term listed in "myterms.txt" (located under
/// <c>d.Helper.SOURCE_DIRECTORY_NAME</c>) that is not a key of the index's
/// <c>TermIds</c> table, preceded by the number of such terms.
/// </summary>
/// <remarks>
/// The term of a line is the text before its first space. A non-empty line
/// without any space is treated as a bare term, and empty lines are skipped.
/// A term that occurs on several lines is reported once.
/// </remarks>
public void GetNewTerms()
{
    d.Index index = new d.Index(d.Helper.INDEX_DIRECTORY_NAME);
    Hashtable terms = index.TermIds;
    string path = d.Helper.SOURCE_DIRECTORY_NAME + "myterms.txt";
    Hashtable newTerms = new Hashtable();

    // The using blocks release the file handle even when reading throws.
    using (FileStream fs = new FileStream(path, FileMode.Open))
    using (StreamReader r = new StreamReader(fs, System.Text.Encoding.ASCII))
    {
        string line;
        while ((line = r.ReadLine()) != null)
        {
            string term;
            int firstSpace = line.IndexOf(' ');
            if (firstSpace >= 0)
            {
                term = line.Substring(0, firstSpace);
            }
            else if (line.Length > 0)
            {
                // No separator: Substring(0, -1) would throw, so take the whole line.
                term = line;
            }
            else
            {
                continue;
            }

            if (!terms.Contains(term))
            {
                // Indexer assignment instead of Add: Add throws on a repeated term.
                newTerms[term] = true;
            }
        }
    }

    Console.WriteLine("_________________new terms = " + newTerms.Count);
    foreach (object newTerm in newTerms.Keys)
    {
        Console.WriteLine(newTerm);
    }
}
/// <summary>
/// Runs the trimmed text of <c>tbxQuery</c> through a <c>d.VSSearcher</c> and renders
/// the leading results as an HTML table into <c>ltrResults</c>.
/// </summary>
/// <remarks>
/// The number of rows is the smaller of the result count and the number typed into
/// <c>tbxDisplay</c> (never below zero); the header reports that same number.
/// Titles and URLs come from the index and are HTML-encoded before being emitted.
/// Non-numeric text in <c>tbxPagerank</c> or <c>tbxDisplay</c> still raises the
/// conversion exception thrown by <c>Convert</c>.
/// </remarks>
private void execSearch()
{
    // Value of the tbxPagerank box, handed to VSSearcher as its second argument.
    float w = Convert.ToSingle(tbxPagerank.Text);
    d.Index index = new d.Index(getIndexDir());
    d.VSSearcher searcher = new d.VSSearcher(index, w);
    d.ResultDocument[] results = searcher.Search(tbxQuery.Text.Trim());

    if (results == null || results.Length == 0)
    {
        ltrResults.Text = "There were no results";
        return;
    }

    // Clamp at zero so a negative request is reported as "top 0" rather than "top -n".
    int max = Math.Max(0, Math.Min(results.Length, Convert.ToInt32(tbxDisplay.Text)));

    StringBuilder sb = new StringBuilder();
    // Report the number of rows actually rendered, not the raw textbox content.
    sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. Displaying the top {1}:</p>", results.Length, max);
    sb.Append("<table style='font-family:Verdana;font-size:10pt;' cellpadding='3' cellspacing='2' width='100%'>");
    sb.Append("<tr bgcolor='#f1f1f1'><td>#</td><td>similarity</td><td>pagerank</td><td>document</td></tr>");

    for (int i = 0; i < max; i++)
    {
        int docId = results[i].DocId;
        string url = "http://" + index.GetURL(docId).Replace("%%", "/");

        // Index content is not trusted markup. HtmlEncode always escapes the double
        // quote, so the href attributes below are double-quoted to stay safe.
        string safeUrl = System.Web.HttpUtility.HtmlEncode(url);
        string safeTitle = System.Web.HttpUtility.HtmlEncode(index.GetTitle(docId));

        sb.AppendFormat("<tr><td>{4}</td><td>{0}</td><td>{1}</td><td><p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{2}\">{3}</a>", results[i].Similarity, results[i].PageRank, safeUrl, safeTitle, i + 1);
        sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{0}</a></td></tr>", safeUrl);
    }

    sb.Append("</table>");
    ltrResults.Text = sb.ToString();
}
/// <summary>
/// Builds an index from the hard-coded test source directory into the hard-coded
/// test index directory, then opens that index into the <c>index</c> member.
/// </summary>
public void SetUp()
{
    const string sourceDir = @"C:\_current\development\data\isr\testindex\source\";
    const string indexDir = @"C:\_current\development\data\isr\testindex\index\";

    // Build first, then open what was just written.
    i.IndexBuilder builder = new i.IndexBuilder(new i.DataLoader(sourceDir), indexDir);
    builder.BuildIndex();

    index = new d.Index(indexDir);
}
/// <summary>
/// Prints to the console every key of the index's <c>TermIds</c> table that does not
/// appear as a term in "myterms.txt" (located under
/// <c>d.Helper.SOURCE_DIRECTORY_NAME</c>), together with the sizes of both sets.
/// </summary>
/// <remarks>
/// The term of a line is the text before its first space. A non-empty line
/// without any space is treated as a bare term, and empty lines are skipped.
/// A term that occurs on several lines is counted once.
/// </remarks>
public void GetMissedTerms()
{
    string path = d.Helper.SOURCE_DIRECTORY_NAME + "myterms.txt";
    Hashtable myterms = new Hashtable();

    // The using blocks release the file handle even when reading throws.
    using (FileStream fs = new FileStream(path, FileMode.Open))
    using (StreamReader r = new StreamReader(fs, System.Text.Encoding.ASCII))
    {
        string line;
        while ((line = r.ReadLine()) != null)
        {
            string term;
            int firstSpace = line.IndexOf(' ');
            if (firstSpace >= 0)
            {
                term = line.Substring(0, firstSpace);
            }
            else if (line.Length > 0)
            {
                // No separator: Substring(0, -1) would throw, so take the whole line.
                term = line;
            }
            else
            {
                continue;
            }

            // Indexer assignment instead of Add: Add throws on a repeated term.
            myterms[term] = true;
        }
    }

    Console.WriteLine("_________________my terms = " + myterms.Count);

    d.Index index = new d.Index(d.Helper.INDEX_DIRECTORY_NAME);
    Hashtable terms = index.TermIds;
    Hashtable missedterms = new Hashtable();
    foreach (object indexedTerm in terms.Keys)
    {
        if (!myterms.Contains(indexedTerm))
        {
            missedterms[indexedTerm] = true;
        }
    }

    Console.WriteLine("_________________missed terms = " + missedterms.Count);
    foreach (object missed in missedterms.Keys)
    {
        Console.WriteLine(missed);
    }
}
/// <summary>
/// Runs the trimmed text of <c>tbxQuery</c> through a <c>d.VSSearcher</c>, feeds the
/// results to a <c>d.AHPageLoader</c>, and renders the leading entries of its
/// <c>Authorities</c> array as an HTML table into <c>ltrResults</c>.
/// </summary>
/// <remarks>
/// The number of rows is the smaller of the authority count and the number typed
/// into <c>tbxDisplay</c> (never below zero). Titles and URLs come from the index
/// and are HTML-encoded before being emitted. Non-numeric text in any of the numeric
/// text boxes still raises the conversion exception thrown by <c>Convert</c>.
/// </remarks>
private void execAuthorities()
{
    // Value of the tbxPagerank box, handed to VSSearcher as its second argument.
    float w = Convert.ToSingle(tbxPagerank.Text);
    d.Index index = new d.Index(getIndexDir());
    d.VSSearcher searcher = new d.VSSearcher(index, w);
    d.ResultDocument[] results = searcher.Search(tbxQuery.Text.Trim());

    if (results == null || results.Length == 0)
    {
        ltrResults.Text = "There were no results";
        return;
    }

    int rootSize = Convert.ToInt32(tbxRoot.Text);
    int maxParents = Convert.ToInt32(tbxParents.Text);
    int maxChildren = Convert.ToInt32(tbxChildren.Text);
    d.AHPageLoader ahl = new d.AHPageLoader(index, results, rootSize, maxParents, maxChildren);
    d.AHDocument[] authorities = ahl.Authorities;

    // Clamp at zero so a negative request is reported as "top 0" rather than "top -n".
    int max = Math.Max(0, Math.Min(authorities.Length, Convert.ToInt32(tbxDisplay.Text)));

    StringBuilder sb = new StringBuilder();
    sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. " + "Displaying the top {1} authorities:</p>", results.Length, max);
    sb.Append("<table style='font-family:Verdana;font-size:10pt;' cellpadding='3' cellspacing='2' width='100%'>");
    sb.Append("<tr bgcolor='#f1f1f1'><td>authority score</td><td>hub score</td><td>document</td></tr>");

    for (int i = 0; i < max; i++)
    {
        int docId = authorities[i].DocId;
        string url = "http://" + index.GetURL(docId).Replace("%%", "/");

        // Index content is not trusted markup. HtmlEncode always escapes the double
        // quote, so the href attributes below are double-quoted to stay safe.
        string safeUrl = System.Web.HttpUtility.HtmlEncode(url);
        string safeTitle = System.Web.HttpUtility.HtmlEncode(index.GetTitle(docId));

        sb.AppendFormat("<tr><td>{0}</td><td>{1}</td><td><p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{2}\">{3}</a>", authorities[i].AuthorityScore, authorities[i].HubScore, safeUrl, safeTitle);
        sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{0}</a></td></tr>", safeUrl);
    }

    sb.Append("</table>");
    ltrResults.Text = sb.ToString();
}
/// <summary>
/// Runs the trimmed text of <c>tbxQuery</c> through a <c>d.VSSearcher</c>, clusters the
/// leading results, and renders each cluster (its common terms plus up to three of
/// its documents) as HTML into <c>ltrResults</c>.
/// </summary>
/// <remarks>
/// The number of documents clustered is the smaller of the result count and the
/// number typed into <c>tbxDocs</c>; the cluster count comes from <c>tbxKmeans</c>.
/// Terms, titles and URLs come from the index and are HTML-encoded before being
/// emitted. Non-numeric text in any of the numeric text boxes still raises the
/// conversion exception thrown by <c>Convert</c>.
/// </remarks>
private void execCluster()
{
    // Value of the tbxPagerank box, handed to VSSearcher as its second argument.
    float w = Convert.ToSingle(tbxPagerank.Text);
    d.Index index = new d.Index(getIndexDir());
    d.VSSearcher searcher = new d.VSSearcher(index, w);
    d.ResultDocument[] results = searcher.Search(tbxQuery.Text.Trim());

    if (results == null || results.Length == 0)
    {
        ltrResults.Text = "There were no results";
        return;
    }

    int toCluster = Math.Min(Convert.ToInt32(tbxDocs.Text), results.Length);
    short[] docIds = new short[toCluster];
    for (int i = 0; i < toCluster; i++)
    {
        docIds[i] = results[i].DocId;
    }

    int k = Convert.ToInt32(tbxKmeans.Text);
    d.Cluster[] clusters;
    if (radKmeans.Checked)
    {
        clusters = new d.KMeansClustering(index, k, true).GetClusters(docIds, 10);
    }
    else
    {
        // NOTE(review): the original had a separate radBuckshot branch that was
        // byte-identical to this fallback, so selecting "buckshot" has always run
        // bisecting clustering. Confirm whether a dedicated implementation was intended.
        clusters = new d.BisectingClustering(index, k).GetClusters(docIds, 10);
    }

    // At most this many documents are listed per cluster.
    const int topDocs = 3;

    StringBuilder sb = new StringBuilder();
    sb.AppendFormat("<p style='border-bottom:solid 1px #999999;'>Your search returned <b>{0}</b> results. " + " Displaying the top {1} documents clustered into at most {2} clusters:</p>", results.Length, toCluster, k);

    for (int i = 0; i < clusters.Length; i++)
    {
        sb.AppendFormat("<p style='margin-bottom:-10px;font-weight:bold;font-size:11pt;'>Cluster {0}", i + 1);
        sb.Append("<p>Common terms: ");

        IDictionaryEnumerator en = clusters[i].CommonTermIds.GetEnumerator();
        while (en.MoveNext())
        {
            // Terms originate from indexed documents; encode them before emitting.
            sb.AppendFormat("{0} ", System.Web.HttpUtility.HtmlEncode(index.GetTerm(Convert.ToInt32(en.Key))));
        }

        sb.Append("<p>");

        en = clusters[i].DocIds.GetEnumerator();
        int count = 0;
        while (count < topDocs && en.MoveNext())
        {
            short docId = Convert.ToInt16(en.Key);
            string url = "http://" + index.GetURL(docId).Replace("%%", "/");

            // HtmlEncode always escapes the double quote, so the href attributes
            // below are double-quoted to stay safe.
            string safeUrl = System.Web.HttpUtility.HtmlEncode(url);
            string safeTitle = System.Web.HttpUtility.HtmlEncode(index.GetTitle(docId));

            sb.AppendFormat("<p style='margin-bottom:-10px;'><a style='font-size:11pt;' href=\"{0}\">{1}</a>", safeUrl, safeTitle);
            sb.AppendFormat("<p><a style='color:green;font-size:9pt' href=\"{0}\">{0}</a>", safeUrl);
            count++;
        }

        sb.Append("<p style='border-bottom: solid 1px #999999;'>");
    }

    ltrResults.Text = sb.ToString();
}
/// <summary>
/// Converts "hashedLinks.txt" (under <c>d.Helper.SOURCE_DIRECTORY_NAME</c>) into a
/// sorted list of (fromId, toId) document-id pairs and writes it twice: as binary to
/// <c>INDEX_DIRECTORY_NAME + INDEX_DOCLINKS_FILE</c> and as text to that path + ".txt".
/// </summary>
/// <remarks>
/// Each input line is read as: seven leading characters that are skipped (presumably
/// "http://" - confirm against the file), the source URL up to the marker "-->[",
/// then a comma-separated list of link URLs, with the final character of the line
/// dropped (presumably the closing "]"). Lines that do not fit this shape are
/// reported and skipped. Lines whose source URL is unknown to the index are skipped
/// silently; links whose target is unknown are reported and skipped. Repeated
/// links are stored once, provided <c>DocLink</c> defines value equality.
/// </remarks>
public void convertLinksFile()
{
    //step 1: load links into hashtable
    d.Index index = new d.Index(d.Helper.INDEX_DIRECTORY_NAME);
    Hashtable linksHash = new Hashtable();
    char[] del = { ',' };

    // The using blocks release the file handle even when parsing throws.
    using (FileStream fs = new FileStream(d.Helper.SOURCE_DIRECTORY_NAME + "hashedLinks.txt", FileMode.Open))
    using (StreamReader r = new StreamReader(fs, System.Text.Encoding.ASCII))
    {
        string line;
        while ((line = r.ReadLine()) != null)
        {
            if (line.Length == 0)
            {
                continue;
            }

            int endOfUrl = line.IndexOf("-->[");
            int startOfLinks = endOfUrl + 4;
            // Same value as the original "Length - 7 - url.Length - 4 - 1".
            int lengthOfLinks = line.Length - startOfLinks - 1;
            if (endOfUrl < 7 || lengthOfLinks < 0)
            {
                // Missing marker or truncated line: Substring would throw below.
                Console.WriteLine("malformed line skipped: " + line);
                continue;
            }

            string url = line.Substring(7, endOfUrl - 7);
            short docId = index.GetDocId(url);
            if (docId == -1)
            {
                continue;
            }

            string[] linkUrls = line.Substring(startOfLinks, lengthOfLinks).Split(del);
            for (int i = 0; i < linkUrls.Length; i++)
            {
                string link = linkUrls[i].Trim();
                if (link.Length == 0)
                {
                    continue;
                }

                short linkId = index.GetDocId(link);
                if (linkId == -1)
                {
                    // Report and skip: the original went on to store a link to id -1.
                    Console.WriteLine("linkId error: " + link);
                    continue;
                }

                // Indexer assignment instead of Add: Add throws on a repeated key.
                linksHash[new DocLink(docId, linkId)] = true;
            }
        }
    }

    //step 2: load into array and sort them
    DocLink[] docLinkArray = new DocLink[linksHash.Count];
    linksHash.Keys.CopyTo(docLinkArray, 0);
    Array.Sort(docLinkArray);

    //step 3: write out to file
    string path = d.Helper.INDEX_DIRECTORY_NAME + d.Helper.INDEX_DOCLINKS_FILE;
    string textpath = path + ".txt";
    int counter = 0;

    // FileMode.Create overwrites an existing file, replacing the manual
    // File.Exists / File.Delete / CreateNew sequence. The writers are declared last,
    // so they are flushed and disposed before their underlying streams.
    using (FileStream fs1 = new FileStream(path, FileMode.Create))
    using (FileStream fsText = new FileStream(textpath, FileMode.Create))
    using (BinaryWriter w = new BinaryWriter(fs1))
    using (StreamWriter wText = new StreamWriter(fsText))
    {
        foreach (DocLink dl in docLinkArray)
        {
            w.Write(dl.fromId);
            w.Write(dl.toId);
            wText.WriteLine(dl.fromId + " " + dl.toId);
            counter++;
        }
    }

    Console.WriteLine("total links=" + counter);
}
/// <summary>
/// Announces the load on the console, then constructs a <c>d.Index</c> from
/// <paramref name="path"/> and stores it in the <c>index</c> member.
/// </summary>
/// <param name="path">Argument passed unchanged to the <c>d.Index</c> constructor.</param>
public ClusteringTest(string path)
{
    Console.WriteLine("Loading index...");

    d.Index loaded = new d.Index(path);
    index = loaded;
}
/// <summary>
/// Announces the load on the console, then constructs a <c>d.Index</c> from
/// <paramref name="indexPath"/> and stores it in the <c>index</c> member.
/// </summary>
/// <param name="indexPath">Argument passed unchanged to the <c>d.Index</c> constructor.</param>
public SearchTest(string indexPath)
{
    Console.WriteLine("Loading index...");

    d.Index loaded = new d.Index(indexPath);
    index = loaded;
}