/// <summary>
/// Writes the term/document postings to two files under <c>targetDirectoryPath</c>:
/// a binary file named <c>d.Helper.INDEX_TERMDOCS_FILE</c> and a space-separated
/// text copy of the same records in the <c>d.Helper.DATATXT_DIRECTORY_NAME</c>
/// sub-directory. Any existing copies of either file are deleted first.
/// </summary>
/// <remarks>
/// Each binary record is written as: termId (int), docId (short),
/// termWeight (float), termCount (short). The weight is the term count
/// multiplied by <c>idf[termId]</c>.
/// </remarks>
private void createTermDocs()
{
    string path = targetDirectoryPath + d.Helper.INDEX_TERMDOCS_FILE;
    if (File.Exists(path))
    {
        File.Delete(path);
    }

    string pathText = targetDirectoryPath + d.Helper.DATATXT_DIRECTORY_NAME + "\\" + d.Helper.INDEX_TERMDOCS_FILE + ".txt";
    if (File.Exists(pathText))
    {
        File.Delete(pathText);
    }

    // using-blocks guarantee that all three handles are released even when
    // reading the postings or writing a record throws part-way through.
    using (FileStream fs = new FileStream(path, FileMode.CreateNew))
    using (BinaryWriter w = new BinaryWriter(fs))
    using (StreamWriter wText = new StreamWriter(pathText, false, System.Text.Encoding.ASCII))
    {
        for (int termId = 0; termId < dataLoader.NumberOfTerms; termId++)
        {
            d.TermDocItem[] termDocs = dataLoader.GetTermDocItems(termId);
            for (int j = 0; j < termDocs.Length; j++)
            {
                short docId = termDocs[j].DocId;
                short termCount = termDocs[j].TermCount; // # of these terms in this doc
                float termWeight = Convert.ToSingle(termCount * idf[termId]);

                // Field order and widths define the on-disk record layout.
                w.Write(termId);
                w.Write(docId);
                w.Write(termWeight);
                w.Write(termCount);

                wText.WriteLine(termId + " " + docId + " " + termWeight + " " + termCount);
            }
        }
    }

    Console.WriteLine("created " + d.Helper.INDEX_TERMDOCS_FILE);
}
/// <summary>
/// Computes one norm per document and stores the result in <c>norms</c>.
/// For every posting of every term, (termCount * idfs[termId])^2 is added to
/// the owning document's accumulator; the norm is the square root of that sum.
/// </summary>
public void Load()
{
    int numOfTerms = dataLoader.NumberOfTerms;
    int numOfDocs = dataLoader.NumberOfDocs;

    // Accumulate in double and narrow to float only once, at the end.
    double[] sumOfSquares = new double[numOfDocs];

    for (int termId = 0; termId < numOfTerms; termId++) // loop through all terms
    {
        float idf = idfs[termId]; // constant for every posting of this term
        d.TermDocItem[] termDocs = dataLoader.GetTermDocItems(termId);

        for (int j = 0; j < termDocs.Length; j++)
        {
            // NOTE(review): DocId is used directly as an index, so it is assumed
            // to lie in [0, NumberOfDocs) - confirm against the data loader.
            int docId = termDocs[j].DocId;
            short tf = termDocs[j].TermCount; // # of occurrences of the term in this doc
            sumOfSquares[docId] += Math.Pow(tf * idf, 2);
        }
    }

    norms = new float[numOfDocs];
    for (int i = 0; i < norms.Length; i++)
    {
        norms[i] = Convert.ToSingle(Math.Sqrt(sumOfSquares[i]));
    }
}