コード例 #1
0
        /// <summary>
        /// Indexes <paramref name="folder"/> and each of its immediate subfolders into a single
        /// in-memory positional inverted index, then writes the index statistics, the vocabulary
        /// and the postings to disk under <paramref name="folder"/>.
        /// </summary>
        /// <param name="folder">Root directory of the corpus; output files are written here as well.</param>
        /// <param name="window">Window whose progress bar is initialised, advanced and hidden during indexing.</param>
        private static void BuildIndexForDirectory(string folder, MainWindow window)
        {
            PositionalInvertedIndex invertedIndex = new PositionalInvertedIndex();

            // Size the progress bar from the number of .txt files directly inside the root folder.
            // NOTE(review): files in subfolders are indexed below but are not part of this count - confirm intended.
            _numberOfDocuments = Directory.EnumerateFiles(folder, "*.txt").Count();
            window.InitiateprogressBar(_numberOfDocuments);

            // Root folder first, then every immediate subfolder (no deeper recursion).
            IndexFiles(folder, invertedIndex, window);
            foreach (string childFolder in Directory.EnumerateDirectories(folder))
            {
                IndexFiles(childFolder, invertedIndex, window);
            }

            // Indexing is done: remove the progress bar so the user can start querying.
            window.HideProgressBar();

            invertedIndex.ComputeStatistics();
            invertedIndex.StatToDisk(folder);

            // Persist the in-memory index. The term list drives both output steps; the offsets
            // array has one slot per term and is handed to both writers
            // (presumably filled in by BuildVocabFile - verify against that method).
            string[] terms = invertedIndex.GetDictionary();
            long[] termOffsets = new long[terms.Length];

            BuildVocabFile(folder, terms, termOffsets);
            BuildPostingsFile(folder, invertedIndex, terms, termOffsets);
        }
コード例 #2
0
        /// <summary>
        /// Writes <c>postings.bin</c> and <c>vocabTable.bin</c> into <paramref name="folder"/>.
        /// </summary>
        /// <remarks>
        /// Every number is written in big-endian byte order.
        /// <para>
        /// <c>vocabTable.bin</c>: the term count (32-bit), then for each term the 64-bit value from
        /// <paramref name="vocabPositions"/> followed by the 64-bit offset of that term's postings
        /// in <c>postings.bin</c>.
        /// </para>
        /// <para>
        /// <c>postings.bin</c>: for each term the document count, then for each document the
        /// document-id gap (32-bit), the position count, and each position as a gap from the
        /// previous one (32-bit). The two counts use whatever width the index's <c>Count</c>
        /// members have (presumably 32-bit - verify against PositionalInvertedIndex).
        /// </para>
        /// Gaps are computed in the iteration order of the postings collection.
        /// NOTE(review): this assumes the collection enumerates ids and positions in ascending order - confirm.
        /// </remarks>
        /// <param name="folder">Directory in which both files are created (existing files are overwritten).</param>
        /// <param name="index">Index queried for the postings of each term.</param>
        /// <param name="dictionary">Terms to write, in output order.</param>
        /// <param name="vocabPositions">Per-term value written to the vocabulary table; parallel to <paramref name="dictionary"/>.</param>
        private static void BuildPostingsFile(string folder, PositionalInvertedIndex index, string[] dictionary, long[] vocabPositions)
        {
            // "using" guarantees both files are closed even if writing fails part-way.
            // The inner stream (vocabTable) is disposed first, then postingsFile.
            using (FileStream postingsFile = new FileStream(Path.Combine(folder, "postings.bin"), FileMode.Create))
            using (FileStream vocabTable = new FileStream(Path.Combine(folder, "vocabTable.bin"), FileMode.Create))
            {
                // The vocabulary table starts with the number of terms.
                WriteBigEndianBytes(vocabTable, BitConverter.GetBytes(dictionary.Length));

                for (int termIndex = 0; termIndex < dictionary.Length; termIndex++)
                {
                    var postings = index.GetPostings(dictionary[termIndex]);

                    // Vocabulary table entry: vocab position, then where this term's postings start.
                    WriteBigEndianBytes(vocabTable, BitConverter.GetBytes(vocabPositions[termIndex]));
                    WriteBigEndianBytes(vocabTable, BitConverter.GetBytes(postingsFile.Position));

                    // Number of documents containing the term.
                    WriteBigEndianBytes(postingsFile, BitConverter.GetBytes(postings.Count));

                    // Document ids are stored as gaps from the previous id.
                    int lastDocId = 0;
                    foreach (int docId in postings.Keys)
                    {
                        WriteBigEndianBytes(postingsFile, BitConverter.GetBytes(docId - lastDocId));
                        lastDocId = docId;

                        // Look the position list up once and reuse it for the count and the gaps.
                        var positions = postings[docId];
                        WriteBigEndianBytes(postingsFile, BitConverter.GetBytes(positions.Count));

                        // Positions are stored as gaps from the previous position.
                        int lastPos = 0;
                        foreach (var position in positions)
                        {
                            WriteBigEndianBytes(postingsFile, BitConverter.GetBytes(position - lastPos));
                            lastPos = position;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Writes <paramref name="bytes"/> to <paramref name="output"/> in big-endian order.
        /// The array is expected to come straight from <see cref="BitConverter"/>; on a
        /// little-endian host it is reversed in place before being written.
        /// </summary>
        private static void WriteBigEndianBytes(Stream output, byte[] bytes)
        {
            if (BitConverter.IsLittleEndian)
            {
                Array.Reverse(bytes);
            }
            output.Write(bytes, 0, bytes.Length);
        }
コード例 #3
0
        /// <summary>
        /// Indexes every <c>.txt</c> file directly inside <paramref name="folder"/> and writes one
        /// document weight per indexed file to <c>docWeights.bin</c> in that folder.
        /// </summary>
        /// <remarks>
        /// The weight of a document is sqrt(sum of w^2) over its terms, where w = 1 + ln(tf) and tf is the
        /// term's occurrence count returned by <c>IndexFile</c>. Each weight is written as a big-endian double.
        /// Document ids start at 0 on every call.
        /// NOTE(review): when this is called once per subfolder, ids restart at 0 in the shared index - confirm intended.
        /// </remarks>
        /// <param name="folder">Directory to scan; resolved against the current directory when relative.</param>
        /// <param name="index">Index that receives the terms of each file.</param>
        /// <param name="window">Window whose progress bar is advanced once per file found (of any extension).</param>
        private static void IndexFiles(string folder, PositionalInvertedIndex index, MainWindow window)
        {
            var documentId = 0;

            // "using" guarantees docWeights.bin is closed even if indexing a file throws.
            using (FileStream writer = new FileStream(Path.Combine(folder, "docWeights.bin"), FileMode.Create))
            {
                foreach (string fileName in Directory.EnumerateFiles(Path.Combine(Environment.CurrentDirectory,
                    folder)))
                {
                    // Ordinal comparison: a file extension is not linguistic text.
                    if (fileName.EndsWith(".txt", StringComparison.Ordinal))
                    {
                        var termToOccurence = IndexFile(fileName, index, documentId);

                        // Accumulate the squared term weights while feeding the query-reformulation matrix.
                        double sumOfSquares = 0.0;
                        foreach (var pair in termToOccurence)
                        {
                            var wdt = 1.0 + Math.Log(pair.Value);
                            QueryReformulation.AddWeightToMatrix(pair.Key, pair.Value, documentId, _numberOfDocuments);
                            sumOfSquares += wdt * wdt;
                        }

                        // Document length (ld) for this document.
                        double ld = Math.Sqrt(sumOfSquares);

                        // Append ld to docWeights.bin in big-endian byte order.
                        var buffer = BitConverter.GetBytes(ld);
                        if (BitConverter.IsLittleEndian)
                        {
                            Array.Reverse(buffer);
                        }
                        writer.Write(buffer, 0, buffer.Length);

                        documentId++;
                    }

                    window.IncrementProgressBar();
                }

                // Build the co-occurrence matrix and save it to disk (kept before the writer is
                // closed, matching the original ordering).
                QueryReformulation.CreateMatrix(_numberOfDocuments);
                QueryReformulation.ToDisk(folder);
            }
        }
コード例 #4
0
        /// <summary>
        /// Tokenizes one file, adds every stemmed term to <paramref name="index"/> and feeds the
        /// unstemmed tokens to the k-gram index.
        /// </summary>
        /// <remarks>
        /// A token containing hyphens is indexed once per non-empty hyphen-separated part and once
        /// with the hyphens removed, all at the same position. Tokens made only of hyphens are skipped
        /// and do not advance the position. Any exception is written to the console and the counts
        /// gathered so far are returned (best-effort indexing).
        /// </remarks>
        /// <param name="fileName">Path of the file to read.</param>
        /// <param name="index">Index that receives (term, document, position) entries.</param>
        /// <param name="documentId">Id under which the file's terms are recorded.</param>
        /// <returns>Map from stemmed term to the number of times it was added for this document.</returns>
        private static Dictionary<string, int> IndexFile(string fileName, PositionalInvertedIndex index, int documentId)
        {
            var tftds = new Dictionary<string, int>();

            try
            {
                SimpleTokenStream stream = new SimpleTokenStream(fileName);
                try
                {
                    var position = 0;

                    while (stream.HasNextToken)
                    {
                        var token = stream.NextToken();

                        // Computed once; used for the emptiness check, the stem and the k-grams.
                        var joined = token.Replace("-", "");
                        if (joined == "") continue;

                        if (token.Contains("-"))
                        {
                            // Drop empty parts so "-foo", "foo-" or "a--b" never index an empty term.
                            foreach (var tokenHyphen in token.Split(new[] { '-' }, StringSplitOptions.RemoveEmptyEntries))
                            {
                                var termHyphen = PorterStemmer.ProcessToken(tokenHyphen);
                                index.AddTerm(termHyphen, documentId, position);
                                IncrementTermCount(tftds, termHyphen);
                                KGramIndex.GenerateKgrams(tokenHyphen, true, 0);
                            }
                        }

                        var term = PorterStemmer.ProcessToken(joined);
                        index.AddTerm(term, documentId, position);
                        IncrementTermCount(tftds, term);
                        KGramIndex.GenerateKgrams(joined, true, 0);
                        position++;
                    }
                }
                finally
                {
                    // Close the stream even when tokenizing or indexing fails part-way.
                    stream.Close();
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex);
            }

            return tftds;
        }

        /// <summary>
        /// Adds one occurrence of <paramref name="term"/> to <paramref name="counts"/>,
        /// starting from zero when the term has not been seen yet.
        /// </summary>
        private static void IncrementTermCount(Dictionary<string, int> counts, string term)
        {
            int seen;
            counts.TryGetValue(term, out seen); // leaves seen at 0 when the term is absent
            counts[term] = seen + 1;
        }