/// <summary>
/// Builds the positional inverted index for <paramref name="folder"/> and its
/// immediate subfolders, then persists the statistics, the vocabulary file and
/// the postings file.
/// </summary>
/// <param name="folder">Directory whose files are indexed and where the output is written.</param>
/// <param name="window">Window whose progress bar is driven while indexing runs.</param>
private static void BuildIndexForDirectory(string folder, MainWindow window)
{
    // In-memory index that every IndexFiles call below adds to.
    PositionalInvertedIndex invertedIndex = new PositionalInvertedIndex();

    // Only the ".txt" files sitting directly in the folder are counted here;
    // the same figure sizes the progress bar.
    _numberOfDocuments = Directory.EnumerateFiles(folder, "*.txt").Count();
    window.InitiateprogressBar(_numberOfDocuments);

    // Index the folder itself first, then each first-level subfolder.
    IndexFiles(folder, invertedIndex, window);
    foreach (string childFolder in Directory.EnumerateDirectories(folder))
    {
        IndexFiles(childFolder, invertedIndex, window);
    }

    // Indexing is finished: hide the bar so the user can start searching.
    window.HideProgressBar();

    invertedIndex.ComputeStatistics();
    invertedIndex.StatToDisk(folder);

    // Persist the in-memory index. The vocabulary file is built first;
    // termOffsets is handed to both builders (presumably filled by
    // BuildVocabFile and read by BuildPostingsFile -- confirm against them).
    string[] terms = invertedIndex.GetDictionary();
    long[] termOffsets = new long[terms.Length];

    BuildVocabFile(folder, terms, termOffsets);
    BuildPostingsFile(folder, invertedIndex, terms, termOffsets);
}
/// <summary>
/// Builds the on-disk index for the directory stored in <c>_mPath</c>.
/// </summary>
/// <param name="mainWindow">Window passed through to the directory indexer for progress reporting.</param>
public void BuildIndex(MainWindow mainWindow)
{
    // All the work is delegated to the static directory-level builder.
    BuildIndexForDirectory(_mPath, mainWindow);
}
/// <summary>
/// Indexes every ".txt" file located directly in <paramref name="folder"/>,
/// writes one document weight per indexed file to "docWeights.bin" in that
/// folder, and then asks <c>QueryReformulation</c> to build its matrix and
/// save it to the same folder.
/// </summary>
/// <param name="folder">Directory to scan; also receives "docWeights.bin".</param>
/// <param name="index">Index that each file's terms are added to.</param>
/// <param name="window">Window whose progress bar advances once per enumerated file.</param>
private static void IndexFiles(string folder, PositionalInvertedIndex index, MainWindow window)
{
    // NOTE(review): ids restart at 0 on every call, so documents indexed from
    // different folders share ids -- confirm this is intended by the caller.
    var documentId = 0;

    // The using statement closes the stream even when indexing throws, so the
    // file handle is never leaked and "docWeights.bin" is not left locked.
    using (var writer = new FileStream(Path.Combine(folder, "docWeights.bin"), FileMode.Create))
    {
        foreach (string fileName in Directory.EnumerateFiles(Path.Combine(Environment.CurrentDirectory, folder)))
        {
            // Ordinal comparison: a file extension is not linguistic text.
            if (fileName.EndsWith(".txt", StringComparison.Ordinal))
            {
                var termToOccurence = IndexFile(fileName, index, documentId);

                // Accumulate the squared wdt of every term in the document,
                // where wdt = 1 + ln(value stored for the term).
                double sumOfSquares = 0.0;
                foreach (var pair in termToOccurence)
                {
                    var wdt = 1.0 + Math.Log(pair.Value);
                    QueryReformulation.AddWeightToMatrix(pair.Key, pair.Value, documentId, _numberOfDocuments);
                    sumOfSquares += wdt * wdt;
                }

                // ld is the Euclidean length of the document's wdt vector.
                double ld = Math.Sqrt(sumOfSquares);

                // Store ld as 8 bytes, most significant byte first: the bytes
                // are reversed when the host is little-endian.
                var buffer = BitConverter.GetBytes(ld);
                if (BitConverter.IsLittleEndian)
                {
                    Array.Reverse(buffer);
                }
                writer.Write(buffer, 0, buffer.Length);

                documentId++;
            }

            // The bar advances for every enumerated file, not only ".txt" ones.
            window.IncrementProgressBar();
        }

        // Build the co-occurrence matrix and write it to disk. This stays
        // inside the using block so it still runs before the stream is closed,
        // exactly as before.
        QueryReformulation.CreateMatrix(_numberOfDocuments);
        QueryReformulation.ToDisk(folder);
    }
}