private void loadIds(short[] docs, Index index)
{
    int estimatedHashSize = docs.Length; // initial capacity, for performance only
    termDocCounts = new Hashtable(estimatedHashSize);
    docIds = new Hashtable(estimatedHashSize);
    termIds = new Hashtable(estimatedHashSize);

    int docCursor = 0;   // next matrix row position
    int termCursor = 0;  // next matrix column position
    DocTermItem[] docTerms;

    foreach (short docId in docs)
    {
        docIds.Add(docId, docCursor++);
        docTerms = index.DocTerms(docId);
        foreach (DocTermItem dt in docTerms)
        {
            int termId = dt.TermId;
            if (termIds[termId] == null)
            {
                termIds.Add(termId, termCursor++);
            }
            // count the number of documents containing the term (document frequency)
            if (termDocCounts[termId] == null)
            {
                termDocCounts[termId] = 1;
            }
            else
            {
                termDocCounts[termId] = Convert.ToInt32(termDocCounts[termId]) + 1;
            }
        }
    }
}
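// The loaders in this class rely on a few instance fields and on being called in a
// fixed order. A minimal sketch of that surrounding scaffolding follows; the field
// names and types are taken from the methods themselves, but the class name
// (SimilarityMatrix) and the constructor signature are assumptions, not the
// original declarations.
private Hashtable termDocCounts;   // termId -> number of docs containing the term
private Hashtable docIds;          // docId  -> matrix row position
private Hashtable termIds;         // termId -> matrix column position
private float[,] matrix;           // tf-idf weights: one row per doc, one column per term

public SimilarityMatrix(short[] docs, Index index) // hypothetical entry point
{
    loadIds(docs, index);    // must run first: builds position maps and doc counts
    loadMatrix(docs, index); // then fills the tf-idf matrix
}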
public ResultDocument[] Search(int docId, int topResults)
{
    // build a hashtable of the terms in the doc which is being compared to all the rest
    Hashtable termIds = new Hashtable();
    DocTermItem[] docTerms = index.DocTerms(docId);
    foreach (DocTermItem dti in docTerms)
    {
        termIds.Add(dti.TermId, dti.TermCount);
    }

    ResultDocument[] allResults = new ResultDocument[TRAININGSET];
    float docNorm = index.GetDocNorm(docId);
    float doc2norm;
    float similarity;

    // cosine similarity between docId and every other document
    for (int doc2id = 0; doc2id < allResults.Length; doc2id++)
    {
        doc2norm = index.GetDocNorm(doc2id);
        similarity = getDotProduct(docId, doc2id, termIds) / (docNorm * doc2norm);
        allResults[doc2id] = new ResultDocument(doc2id, similarity);
    }
    Array.Sort(allResults);

    // copy the top results, skipping the query document itself
    ResultDocument[] results = new ResultDocument[topResults];
    int j = 0;
    for (int i = 0; i < topResults; i++)
    {
        if (allResults[j].DocId == docId) // do not return the doc itself!
        {
            j++;
        }
        results[i] = allResults[j++];
    }
    return results;
}
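// Search calls a getDotProduct helper that is not shown in this section. A plausible
// sketch follows, assuming the dot product is accumulated over raw term counts: the
// termIds hashtable already maps the first document's term ids to their counts, so the
// docId parameter is unused here and is kept only to match the call site above.
private float getDotProduct(int docId, int doc2id, Hashtable termIds) // hypothetical
{
    float dotProduct = 0;
    foreach (DocTermItem dt in index.DocTerms(doc2id))
    {
        object count = termIds[dt.TermId];
        if (count != null) // the term occurs in both documents
        {
            dotProduct += Convert.ToSingle(count) * dt.TermCount;
        }
    }
    return dotProduct;
}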
private void loadMatrix(short[] docs, Index index)
{
    matrix = new float[docIds.Count, termIds.Count];
    DocTermItem[] docTerms;
    float idf;
    int termId;
    short termCount;
    double docCount; // double required for correct division

    foreach (short docId in docs)
    {
        docTerms = index.DocTerms(docId);
        foreach (DocTermItem dt in docTerms)
        {
            termId = dt.TermId;
            termCount = dt.TermCount;
            docCount = Convert.ToDouble(termDocCounts[termId]);
            // tf-idf weight: term count scaled by log2(N / document frequency)
            idf = Convert.ToSingle(Math.Log((double)docs.Length / docCount, 2));
            matrix[docPosition(docId), termPosition(termId)] = idf * (float)termCount;
        }
    }
}
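// loadMatrix addresses the matrix through docPosition and termPosition, which are not
// shown here. Assuming they simply read back the row and column cursors stored by
// loadIds, they reduce to hashtable lookups:
private int docPosition(short docId) // hypothetical helper
{
    return Convert.ToInt32(docIds[docId]); // matrix row for this document
}

private int termPosition(int termId) // hypothetical helper
{
    return Convert.ToInt32(termIds[termId]); // matrix column for this term
}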
static void Main(string[] args)
{
    AggregateTester at = new AggregateTester(0, 10, 1, 10, 100, 10, 5, 10, 1);
    at.Run();

    // precision/recall/F-measure on a small hand-built example
    Hashtable result = new Hashtable();
    result.Add(1, true);
    result.Add(2, true);
    result.Add(3, true);
    Hashtable relevant = new Hashtable();
    relevant.Add(1, true);
    relevant.Add(3, true);
    relevant.Add(5, true);
    relevant.Add(7, true);
    relevant.Add(8, true);
    d.PerformanceCalculator pc = new d.PerformanceCalculator(result, relevant);
    Console.WriteLine("Precision = " + pc.Precision);
    Console.WriteLine("Recall = " + pc.Recall);
    Console.WriteLine("FMeasure = " + pc.FMeasure);

    // look up the categories of a single document
    d.DocsLoader dl = new d.DocsLoader();
    d.CatsLoader cl = new d.CatsLoader();
    d.DocCatsLoader dc = new d.DocCatsLoader(cl);
    int docId = 1;
    ArrayList al = dc.GetDocCategories(docId);
    Console.WriteLine(dl.GetDocTitle(docId) + " has " + al.Count + " categories: ");
    foreach (int catId in al)
    {
        Console.WriteLine(" " + cl.GetCategory(catId));
    }

    // open the index and run a vector-space search
    d.Index index = new d.Index(Helper.INDEX_PATH);
    d.DocTermItem[] dterms = index.DocTerms(0);
    SearchVS s = new SearchVS(Helper.INDEX_PATH);
    s.run();

    // rebuild the index from the source data
    i.DataLoader dal = new i.DataLoader(Helper.SOURCE_PATH);
    i.IndexBuilder ib = new i.IndexBuilder(dal, Helper.INDEX_PATH);
    ib.BuildIndex();

    PorterStemmerAlgorithm.PorterStemmer ps = new PorterStemmerAlgorithm.PorterStemmer();
    Console.WriteLine(ps.stemTerm("beautify"));

    TermFilter f = new TermFilter();
    f.CreateNewTermsFile();
    TermProcessor p = new TermProcessor();
    p.CreateTermsFile();
    TermDocsProcessor tdp = new TermDocsProcessor();
    tdp.CreateTermDocsFile();
}
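// With the sample sets in Main above (result = {1, 2, 3}, relevant = {1, 3, 5, 7, 8})
// and the standard definitions, the expected output would be:
//   Precision = |result ∩ relevant| / |result|   = 2 / 3 ≈ 0.667
//   Recall    = |result ∩ relevant| / |relevant| = 2 / 5 = 0.4
//   FMeasure  = 2 * P * R / (P + R)              = 0.5
// assuming PerformanceCalculator implements the usual harmonic-mean F-measure.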