예제 #1
0
        /// <summary>
        /// Rebuilds the term/document index file and its plain-text dump.
        /// </summary>
        /// <remarks>
        /// Any existing output files are deleted first. For every term id in
        /// [0, dataLoader.NumberOfTerms) and every item returned by
        /// dataLoader.GetTermDocItems(termId), one record is appended to the binary
        /// file in this order: termId (int), docId (short), termWeight (float),
        /// termCount (short). The same four values are written to the text file as
        /// one space-separated line. termWeight is termCount * idf[termId].
        /// </remarks>
        private void createTermDocs()
        {
            // NOTE(review): no separator is inserted after targetDirectoryPath, so it
            // presumably already ends with one - confirm against where it is assigned.
            string path = targetDirectoryPath + d.Helper.INDEX_TERMDOCS_FILE;

            if (File.Exists(path))
            {
                File.Delete(path);
            }

            string pathText = targetDirectoryPath + d.Helper.DATATXT_DIRECTORY_NAME + "\\" + d.Helper.INDEX_TERMDOCS_FILE + ".txt";

            if (File.Exists(pathText))
            {
                File.Delete(pathText);
            }

            // using-statements guarantee both files are flushed and closed even when
            // an exception escapes the loop below.
            using (FileStream fs = new FileStream(path, FileMode.CreateNew))
            using (BinaryWriter w = new BinaryWriter(fs))
            using (StreamWriter wText = new StreamWriter(pathText, false, System.Text.Encoding.ASCII))
            {
                for (int termId = 0; termId < dataLoader.NumberOfTerms; termId++)
                {
                    d.TermDocItem[] termDocs = dataLoader.GetTermDocItems(termId);
                    for (int j = 0; j < termDocs.Length; j++)
                    {
                        short docId      = termDocs[j].DocId;
                        float termWeight = Convert.ToSingle(termDocs[j].TermCount * idf[termId]);
                        short termCount  = termDocs[j].TermCount; // # of these terms in this doc

                        // Binary record: int termId, short docId, float weight, short count.
                        w.Write(termId);
                        w.Write(docId);
                        w.Write(termWeight);
                        w.Write(termCount);

                        // NOTE(review): termWeight is formatted with the current culture
                        // here; confirm whether any reader of the .txt dump expects '.'
                        // as the decimal separator.
                        wText.WriteLine(termId + " " + docId + " " + termWeight + " " + termCount);
                    }
                }
            }

            Console.WriteLine("created " + d.Helper.INDEX_TERMDOCS_FILE);
        }
예제 #2
0
        /// <summary>
        /// Computes one norm per document and stores the result in <c>norms</c>.
        /// </summary>
        /// <remarks>
        /// For every term id in [0, dataLoader.NumberOfTerms), each item returned by
        /// dataLoader.GetTermDocItems(termId) adds (TermCount * idfs[termId])^2 to the
        /// running sum of the item's DocId. The norm of a document is the square root
        /// of its sum. <c>norms</c> is replaced only after all sums are complete.
        /// </remarks>
        public void Load()
        {
            int numOfTerms = dataLoader.NumberOfTerms;
            int numOfDocs  = dataLoader.NumberOfDocs;

            // Sums are accumulated in double and narrowed to float only when stored.
            double[] sumOfSquares = new double[numOfDocs];

            for (int termId = 0; termId < numOfTerms; termId++) // loop through all terms
            {
                d.TermDocItem[] termDocs = dataLoader.GetTermDocItems(termId);
                for (int j = 0; j < termDocs.Length; j++)
                {
                    int   docId = termDocs[j].DocId;
                    short tf    = termDocs[j].TermCount; // # of occurrences of the term in this doc
                    float idf   = idfs[termId];
                    sumOfSquares[docId] += Math.Pow(tf * idf, 2);
                }
            }

            norms = new float[numOfDocs];
            for (int i = 0; i < norms.Length; i++)
            {
                norms[i] = Convert.ToSingle(Math.Sqrt(sumOfSquares[i]));
            }
        }