Пример #1
0
        /// <summary>
        /// Writes "postings.bin" and "vocabTable.bin" into <paramref name="folder"/> for every
        /// term in <paramref name="dictionary"/>.
        /// </summary>
        /// <remarks>
        /// Every number is written in big-endian byte order.
        /// vocabTable.bin: the number of terms (4 bytes), then for each term its position from
        /// <paramref name="vocabPositions"/> (8 bytes) followed by the byte offset of that term's
        /// postings inside postings.bin (8 bytes).
        /// postings.bin: for each term, the number of postings followed by the document IDs
        /// encoded as 4-byte gaps (the first gap is measured from 0, so it equals the first ID).
        /// </remarks>
        /// <param name="folder">Directory in which both files are created (existing files are overwritten).</param>
        /// <param name="index">In-memory index queried for each term's postings.</param>
        /// <param name="dictionary">Terms to write, in the order they appear in the vocab table.</param>
        /// <param name="vocabPositions">
        /// Position of each term in the vocabulary list file; entry i belongs to dictionary[i].
        /// </param>
        private static void BuildPostingsFile(string folder, NaiveInvertedIndex index, string[] dictionary, long[] vocabPositions)
        {
            // "using" guarantees both streams are released even if a lookup or write throws part-way
            // through; the vocab table (inner) is still closed before the postings file (outer).
            using (FileStream postingsFile = new FileStream(Path.Combine(folder, "postings.bin"), FileMode.Create))
            using (FileStream vocabTable = new FileStream(Path.Combine(folder, "vocabTable.bin"), FileMode.Create)) {
                // the first thing we must write to the vocabTable file is the number of vocab terms.
                WriteBytesBigEndian(vocabTable, BitConverter.GetBytes(dictionary.Length));

                for (int termIndex = 0; termIndex < dictionary.Length; termIndex++) {
                    var postings = index.GetPostings(dictionary[termIndex]);

                    // vocab table entry: where the term lives in the vocab list file, then where
                    // its postings start in the postings file (the current write position).
                    WriteBytesBigEndian(vocabTable, BitConverter.GetBytes(vocabPositions[termIndex]));
                    WriteBytesBigEndian(vocabTable, BitConverter.GetBytes(postingsFile.Position));

                    // postings entry: the document frequency first, then the gaps.
                    WriteBytesBigEndian(postingsFile, BitConverter.GetBytes(postings.Count));

                    int lastDocId = 0;
                    foreach (int docId in postings) {
                        // encode a gap, not a doc ID
                        WriteBytesBigEndian(postingsFile, BitConverter.GetBytes(docId - lastDocId));
                        lastDocId = docId;
                    }
                }
            }
        }

        /// <summary>
        /// Writes <paramref name="bytes"/> to <paramref name="output"/> in big-endian order,
        /// reversing the array in place first when the platform is little-endian.
        /// </summary>
        private static void WriteBytesBigEndian(Stream output, byte[] bytes)
        {
            if (BitConverter.IsLittleEndian) {
                Array.Reverse(bytes);
            }
            output.Write(bytes, 0, bytes.Length);
        }
Пример #2
0
        /// <summary>
        /// Indexes the documents in <paramref name="folder"/> in memory, then persists the result
        /// to disk through <c>BuildVocabFile</c> and <c>BuildPostingsFile</c>.
        /// </summary>
        /// <param name="folder">Folder holding the documents; also passed to both file builders.</param>
        private static void BuildIndexForDirectory(string folder)
        {
            // Step 1: populate a naive in-memory inverted index from the folder's documents.
            var index = new NaiveInvertedIndex();
            IndexFiles(folder, index);

            // Step 2: take the term list and allocate one vocabulary-file position slot per term.
            string[] terms = index.GetDictionary();
            long[] termPositions = new long[terms.Length];

            // Step 3: write the on-disk files. The vocabulary file goes first; presumably
            // BuildVocabFile fills termPositions, which BuildPostingsFile then reads
            // (BuildVocabFile is not visible here -- verify).
            BuildVocabFile(folder, terms, termPositions);
            BuildPostingsFile(folder, index, terms, termPositions);
        }
Пример #3
0
        /// <summary>
        /// Indexes every file whose name ends in ".txt" directly inside <paramref name="folder"/>
        /// (resolved against the current working directory).
        /// </summary>
        /// <remarks>
        /// Document IDs start at 0 and increase by one per indexed file, in the order
        /// <see cref="Directory.EnumerateFiles(string)"/> yields them; files without the ".txt"
        /// suffix are skipped and do not consume an ID.
        /// </remarks>
        /// <param name="folder">Folder to scan, combined with <see cref="Environment.CurrentDirectory"/>.</param>
        /// <param name="index">Index that receives the terms of each file.</param>
        private static void IndexFiles(string folder, NaiveInvertedIndex index)
        {
            // resolve the directory once; it is used for both the log line and the enumeration.
            string directory = Path.Combine(Environment.CurrentDirectory, folder);
            Console.WriteLine("Indexing " + directory);

            int documentID = 0;
            foreach (string fileName in Directory.EnumerateFiles(directory)) {
                // a file extension is not linguistic text: compare ordinally so the filter does
                // not vary with the current culture (the single-argument overload is culture-sensitive).
                if (!fileName.EndsWith(".txt", StringComparison.Ordinal)) {
                    continue;
                }

                IndexFile(fileName, index, documentID);
                documentID++;
            }
        }
Пример #4
0
        /// <summary>
        /// Tokenizes one file, stems each token, and adds every non-empty stem to
        /// <paramref name="index"/> under <paramref name="documentID"/>.
        /// </summary>
        /// <remarks>
        /// Any exception raised while reading or indexing is written to the console and
        /// swallowed; terms added before the failure remain in the index.
        /// </remarks>
        /// <param name="fileName">Path of the file to tokenize.</param>
        /// <param name="index">Index that receives the stemmed terms.</param>
        /// <param name="documentID">ID recorded for every term found in this file.</param>
        private static void IndexFile(string fileName, NaiveInvertedIndex index,
		 int documentID)
        {
            try {
                var tokens = new SimpleTokenStream(fileName);
                while (tokens.HasNextToken) {
                    string rawToken = tokens.NextToken();
                    var stem = PorterStemmer.ProcessToken(rawToken);

                    // tokens that stem to null or "" carry nothing to index.
                    if (string.IsNullOrEmpty(stem)) {
                        continue;
                    }

                    index.AddTerm(stem, documentID);
                }
            }
            catch (Exception failure) {
                // report the problem and carry on; the caller is not notified.
                Console.WriteLine(failure);
            }
        }