예제 #1
0
        /// <summary>
        /// Rebuilds the <c>docIds</c>, <c>termIds</c> and <c>termDocCounts</c> tables
        /// from the given documents.
        /// </summary>
        /// <remarks>
        /// Documents and terms are numbered consecutively from zero in the order they
        /// are first encountered. <c>termDocCounts</c> is incremented once per
        /// <c>DocTermItem</c> returned by <c>index.DocTerms</c>; presumably that is one
        /// item per distinct term per document, making it a document frequency
        /// (TODO confirm against <c>Index.DocTerms</c>).
        /// A repeated id in <paramref name="docs"/> makes <c>docIds.Add</c> throw.
        /// </remarks>
        /// <param name="docs">Ids of the documents to register.</param>
        /// <param name="index">Source of the per-document term entries.</param>
        private void loadIds(short[] docs, Index index)
        {
            // Only a capacity hint for the tables; it does not affect the results.
            int capacityHint = docs.Length;

            termDocCounts = new Hashtable(capacityHint);
            docIds        = new Hashtable(capacityHint);
            termIds       = new Hashtable(capacityHint);

            int nextDocSlot  = 0;
            int nextTermSlot = 0;

            foreach (short docId in docs)
            {
                docIds.Add(docId, nextDocSlot);
                nextDocSlot++;

                foreach (DocTermItem entry in index.DocTerms(docId))
                {
                    // Keep the key typed as int so the boxed key matches later int lookups.
                    int term = entry.TermId;

                    // First sighting of this term: give it the next free slot.
                    if (!termIds.Contains(term))
                    {
                        termIds.Add(term, nextTermSlot);
                        nextTermSlot++;
                    }

                    // Start the counter at 1, or bump the stored value.
                    object seenSoFar = termDocCounts[term];
                    termDocCounts[term] = (seenSoFar == null) ? 1 : Convert.ToInt32(seenSoFar) + 1;
                }
            }
        }
예제 #2
0
        /// <summary>
        /// Allocates <c>matrix</c> with <c>docIds.Count</c> rows and <c>termIds.Count</c>
        /// columns and fills one cell per document/term entry.
        /// </summary>
        /// <remarks>
        /// Each cell receives <c>log2(docs.Length / termDocCounts[termId]) * TermCount</c>.
        /// Row and column indices come from <c>docPosition</c> and <c>termPosition</c>,
        /// which are defined elsewhere; presumably they look up the slots stored in
        /// <c>docIds</c> and <c>termIds</c> (TODO confirm).
        /// </remarks>
        /// <param name="docs">Ids of the documents whose rows are filled.</param>
        /// <param name="index">Source of the per-document term entries.</param>
        private void loadMatrix(short[] docs, Index index)
        {
            matrix = new float[docIds.Count, termIds.Count];

            foreach (short docId in docs)
            {
                foreach (DocTermItem entry in index.DocTerms(docId))
                {
                    int   term        = entry.TermId;
                    short occurrences = entry.TermCount;

                    // Work in double so the ratio is a real division, not an integer one.
                    double docsWithTerm = Convert.ToDouble(termDocCounts[term]);
                    double ratio        = (double)docs.Length / docsWithTerm;
                    float  weight       = Convert.ToSingle(Math.Log(ratio, 2));

                    matrix[docPosition(docId), termPosition(term)] = weight * (float)occurrences;
                }
            }
        }