Exemplo n.º 1
0
        /// <summary>
        /// Indexes <paramref name="folder"/> and each of its immediate subfolders into a single
        /// positional inverted index, then hands the result to the statistics, vocabulary and
        /// postings writers.
        /// </summary>
        /// <param name="folder">Root directory to index.</param>
        /// <param name="window">Window whose progress bar is driven while indexing runs.</param>
        private static void BuildIndexForDirectory(string folder, MainWindow window)
        {
            // In-memory index shared by the root folder and every subfolder.
            PositionalInvertedIndex invertedIndex = new PositionalInvertedIndex();

            // The document count (and the progress bar size) comes from the "*.txt" files that
            // sit directly inside the root folder.
            // NOTE(review): subfolders are indexed below but are not part of this count - confirm
            // that this is intended.
            _numberOfDocuments = Directory.EnumerateFiles(folder, "*.txt").Count();
            window.InitiateprogressBar(_numberOfDocuments);

            // Root folder first, then one pass per immediate subfolder.
            IndexFiles(folder, invertedIndex, window);
            foreach (string childFolder in Directory.EnumerateDirectories(folder))
            {
                IndexFiles(childFolder, invertedIndex, window);
            }

            // Indexing is finished: hide the progress bar so the user can start searching.
            window.HideProgressBar();

            invertedIndex.ComputeStatistics();
            invertedIndex.StatToDisk(folder);

            // The in-memory index is complete at this point. Persist it: the vocabulary file is
            // built first and fills in termOffsets, which the postings file builder then receives.
            string[] terms = invertedIndex.GetDictionary();
            var termOffsets = new long[terms.Length];

            BuildVocabFile(folder, terms, termOffsets);
            BuildPostingsFile(folder, invertedIndex, terms, termOffsets);
        }
Exemplo n.º 2
0
 /// <summary>
 /// Runs <c>BuildIndexForDirectory</c> on the path held in <c>_mPath</c>.
 /// </summary>
 /// <param name="mainWindow">Window forwarded unchanged to <c>BuildIndexForDirectory</c>.</param>
 public void BuildIndex(MainWindow mainWindow) => BuildIndexForDirectory(_mPath, mainWindow);
Exemplo n.º 3
0
        /// <summary>
        /// Indexes every file in <paramref name="folder"/> whose name ends with ".txt" and writes
        /// one document weight (ld) per indexed file to "docWeights.bin" in that folder.
        /// </summary>
        /// <remarks>
        /// Document ids start at 0 on every call and increase by one per indexed file. After all
        /// files are processed, <c>QueryReformulation.CreateMatrix</c> and
        /// <c>QueryReformulation.ToDisk</c> are invoked for the folder.
        /// </remarks>
        /// <param name="folder">Directory to scan; resolved against the current directory for enumeration.</param>
        /// <param name="index">Index that receives the postings of each file.</param>
        /// <param name="window">Window whose progress bar is advanced once per enumerated file.</param>
        private static void IndexFiles(string folder, PositionalInvertedIndex index, MainWindow window)
        {
            var documentId = 0;

            // "using" guarantees the weights file is flushed and closed even when indexing a
            // document throws; previously the handle leaked on any exception.
            using (var writer = new FileStream(Path.Combine(folder, "docWeights.bin"), FileMode.Create))
            {
                string searchRoot = Path.Combine(Environment.CurrentDirectory, folder);
                foreach (string fileName in Directory.EnumerateFiles(searchRoot))
                {
                    // Ordinal comparison: file extensions are identifiers, not linguistic text.
                    if (fileName.EndsWith(".txt", StringComparison.Ordinal))
                    {
                        var termToOccurence = IndexFile(fileName, index, documentId);

                        // Accumulate the squared wdt of every term while feeding the
                        // query-reformulation matrix.
                        double sumOfSquares = 0.0;
                        foreach (var pair in termToOccurence)
                        {
                            var wdt = 1.0 + Math.Log(pair.Value);
                            QueryReformulation.AddWeightToMatrix(pair.Key, pair.Value, documentId, _numberOfDocuments);
                            sumOfSquares += wdt * wdt;
                        }

                        // ld = Euclidean length of the document's wdt vector.
                        double ld = Math.Sqrt(sumOfSquares);

                        // Store ld as 8 big-endian bytes: reverse on little-endian hosts.
                        var buffer = BitConverter.GetBytes(ld);
                        if (BitConverter.IsLittleEndian)
                        {
                            Array.Reverse(buffer);
                        }
                        writer.Write(buffer, 0, buffer.Length);

                        documentId++;
                    }

                    // NOTE(review): advanced for every enumerated file, including non-".txt"
                    // ones such as docWeights.bin itself - confirm this matches the bar's maximum.
                    window.IncrementProgressBar();
                }

                // Compute the co-occurrence matrix and put it to disk.
                QueryReformulation.CreateMatrix(_numberOfDocuments);
                QueryReformulation.ToDisk(folder);
            }
        }