예제 #1
0
        /// <summary>
        /// Rebuilds the three lookup tables (docIds, termIds, termDocCounts) from the given documents.
        /// </summary>
        /// <param name="docs">Document ids to register; each one is mapped to the next free position, in array order.</param>
        /// <param name="index">Index that supplies the term entries of every document.</param>
        private void loadIds(short[] docs, Index index)
        {
            // The capacity is only a sizing hint for the tables, not a limit.
            int capacityHint = docs.Length;

            termDocCounts = new Hashtable(capacityHint);
            docIds        = new Hashtable(capacityHint);
            termIds       = new Hashtable(capacityHint);

            int nextDocPosition  = 0;
            int nextTermPosition = 0;

            foreach (short docId in docs)
            {
                // Hashtable.Add throws if the same document id shows up twice in docs.
                docIds.Add(docId, nextDocPosition);
                nextDocPosition++;

                foreach (DocTermItem entry in index.DocTerms(docId))
                {
                    int termId = entry.TermId;

                    // First sighting of a term: assign it the next position.
                    if (!termIds.ContainsKey(termId))
                    {
                        termIds.Add(termId, nextTermPosition);
                        nextTermPosition++;
                    }

                    // Count how many document term entries referenced this term so far.
                    object seenSoFar = termDocCounts[termId];
                    termDocCounts[termId] = (seenSoFar == null)
                        ? 1
                        : Convert.ToInt32(seenSoFar) + 1;
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Scores every document with id 0..TRAININGSET-1 against the given document
        /// (dot product divided by the product of the two document norms), sorts the
        /// scores and returns the leading entries, leaving out the query document itself.
        /// </summary>
        /// <param name="docId">Id of the document that all others are compared to.</param>
        /// <param name="topResults">Maximum number of results to return.</param>
        /// <returns>
        /// The first results in sorted order. The array holds <paramref name="topResults"/>
        /// entries, or fewer when not that many other documents are available.
        /// </returns>
        public ResultDocument[] Search(int docId, int topResults)
        {
            //make a hashtable for terms in the doc which is being compared to all the rest
            Hashtable termIds = new Hashtable();

            DocTermItem[] docTerms = index.DocTerms(docId);
            foreach (DocTermItem dti in docTerms)
            {
                termIds.Add(dti.TermId, dti.TermCount);
            }

            ResultDocument[] allResults = new ResultDocument[TRAININGSET];
            float            docNorm    = index.GetDocNorm(docId);

            for (int doc2id = 0; doc2id < allResults.Length; doc2id++)
            {
                float normProduct = docNorm * index.GetDocNorm(doc2id);

                // A zero norm would turn the division into NaN/Infinity and feed that
                // into the sort; score such a pair as "no similarity" instead.
                float similarity = (normProduct == 0f)
                    ? 0f
                    : getDotProduct(docId, doc2id, termIds) / normProduct;

                allResults[doc2id] = new ResultDocument(doc2id, similarity);
            }

            // Ordering comes from ResultDocument's own comparison
            // (presumably best match first - confirm against ResultDocument).
            Array.Sort(allResults);

            // Never ask for more entries than exist; the original loop ran past the
            // end of allResults when topResults was too large.
            int capacity = Math.Min(topResults, allResults.Length);
            ResultDocument[] results = new ResultDocument[capacity];
            int filled = 0;

            for (int i = 0; i < allResults.Length && filled < capacity; i++)
            {
                if (allResults[i].DocId == docId) //do not return the doc itself!
                {
                    continue;
                }

                results[filled++] = allResults[i];
            }

            // Skipping the query document can leave the array one short; trim it so
            // callers never see an unfilled trailing slot.
            if (filled < capacity)
            {
                ResultDocument[] trimmed = new ResultDocument[filled];
                Array.Copy(results, trimmed, filled);
                results = trimmed;
            }

            return(results);
        }
예제 #3
0
        /// <summary>
        /// Allocates the document-by-term matrix and fills in one weighted cell per
        /// document term entry: log2(docs.Length / termDocCounts[term]) * term count.
        /// Relies on docIds, termIds and termDocCounts having been populated beforehand.
        /// </summary>
        /// <param name="docs">Document ids whose term entries are written into the matrix.</param>
        /// <param name="index">Index that supplies the term entries of every document.</param>
        private void loadMatrix(short[] docs, Index index)
        {
            matrix = new float[docIds.Count, termIds.Count];

            // Kept as double so the ratio below is a floating-point division.
            double totalDocs = (double)docs.Length;

            foreach (short docId in docs)
            {
                foreach (DocTermItem entry in index.DocTerms(docId))
                {
                    int    termId       = entry.TermId;
                    short  occurrences  = entry.TermCount;
                    double docsWithTerm = Convert.ToDouble(termDocCounts[termId]);

                    float inverseDocFrequency = Convert.ToSingle(Math.Log(totalDocs / docsWithTerm, 2));

                    int row    = docPosition(docId);
                    int column = termPosition(termId);

                    matrix[row, column] = inverseDocFrequency * (float)occurrences;
                }
            }
        }
예제 #4
0
파일: MainApp.cs 프로젝트: ic4f/oldcode
        /// <summary>
        /// Scratch driver: runs each experiment and build step one after another, in a fixed order.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // --- aggregate test run ---
            AggregateTester aggregateTester = new AggregateTester(0, 10, 1, 10, 100, 10, 5, 10, 1);
            aggregateTester.Run();

            // --- precision / recall / F-measure on a tiny hand-made example ---
            Hashtable retrieved = new Hashtable();
            retrieved.Add(1, true);
            retrieved.Add(2, true);
            retrieved.Add(3, true);

            Hashtable expected = new Hashtable();
            expected.Add(1, true);
            expected.Add(3, true);
            expected.Add(5, true);
            expected.Add(7, true);
            expected.Add(8, true);

            d.PerformanceCalculator calculator = new d.PerformanceCalculator(retrieved, expected);
            Console.WriteLine("Precision = " + calculator.Precision);
            Console.WriteLine("Recall = " + calculator.Recall);
            Console.WriteLine("FMeasure = " + calculator.FMeasure);

            // --- list the categories of one document ---
            d.DocsLoader docsLoader = new d.DocsLoader();
            d.CatsLoader catsLoader = new d.CatsLoader();
            d.DocCatsLoader docCatsLoader = new d.DocCatsLoader(catsLoader);
            int sampleDocId = 1;
            ArrayList categories = docCatsLoader.GetDocCategories(sampleDocId);

            Console.WriteLine(docsLoader.GetDocTitle(sampleDocId) + " has " + categories.Count + " categories: ");
            foreach (int categoryId in categories)
            {
                Console.WriteLine("  " + catsLoader.GetCategory(categoryId));
            }

            // --- open the index and read the terms of document 0 (result is not used further) ---
            d.Index index = new d.Index(Helper.INDEX_PATH);
            d.DocTermItem[] firstDocTerms = index.DocTerms(0);

            // --- vector-space search run ---
            SearchVS search = new SearchVS(Helper.INDEX_PATH);
            search.run();

            // --- build the index from the source data ---
            i.DataLoader dataLoader = new i.DataLoader(Helper.SOURCE_PATH);
            i.IndexBuilder indexBuilder = new i.IndexBuilder(dataLoader, Helper.INDEX_PATH);
            indexBuilder.BuildIndex();

            // --- stemmer smoke check ---
            PorterStemmerAlgorithm.PorterStemmer stemmer = new PorterStemmerAlgorithm.PorterStemmer();
            Console.WriteLine(stemmer.stemTerm("beautify"));

            // --- term file generation steps ---
            TermFilter termFilter = new TermFilter();
            termFilter.CreateNewTermsFile();

            TermProcessor termProcessor = new TermProcessor();
            termProcessor.CreateTermsFile();

            TermDocsProcessor termDocsProcessor = new TermDocsProcessor();
            termDocsProcessor.CreateTermDocsFile();
            // NOTE(review): the same call is issued twice in a row - confirm whether the repeat is intentional.
            termDocsProcessor.CreateTermDocsFile();
        }