예제 #1
0
        // Load the document-vector lengths stored in the file at lenPath.
        // The file is consumed as consecutive (int, double) records, and every
        // record is added to the `length` table with the int as key (presumably
        // the document id - confirm against the writer of that file).
        protected void GetVectorLength()
        {
            SourceBuffer reader = new SourceBuffer(Disk.DISKBLOCK_SIZE, lenPath);

            for (;;)
            {
                // Stop once the whole file has been consumed.
                if (reader.IsCompletelyRead())
                {
                    break;
                }

                // The key is read before the value, matching the record layout.
                var docKey = reader.GetInt();
                var vectorLength = reader.GetDouble();
                length.Add(docKey, vectorLength);
            }
        }
예제 #2
0
 // Read every (int, double) record from the file at lenPath and register it
 // in the `length` table, keyed by the int (presumably the document id -
 // confirm against the code that writes this file).
 protected void GetVectorLength()
 {
     SourceBuffer lengthReader = new SourceBuffer(Disk.DISKBLOCK_SIZE, lenPath);
     bool exhausted = lengthReader.IsCompletelyRead();
     while (!exhausted)
     {
         // Key first, then value: this is the order the record is laid out in.
         var key = lengthReader.GetInt();
         var value = lengthReader.GetDouble();
         length.Add(key, value);
         exhausted = lengthReader.IsCompletelyRead();
     }
 }
예제 #3
0
        // Merge all SPIMI block files into one index and record document-vector lengths.
        //
        // Inputs : block i is read from "..//..//SPIMI//" + inFileName + i + ".txt".
        // Outputs: outFileName + ".txt"        - per term: term length, term chars,
        //                                        number of posting ints, (docId, frequency) pairs
        //          outFileName + "_map.txt"    - running offset of each term record, computed
        //                                        from the sizes of the values stored above
        //          outFileName + "_length.txt" - (docId, sum of squared tf-idf weights) pairs
        // docCount is the collection size; it is used for idf and to pre-size the length table.
        //
        // NOTE(review): the merge relies on string.CompareTo ordering; this presumably matches
        // the order in which each block was sorted when it was written - confirm.
        private void MergeBlocks(string inFileName, int blockCount, string outFileName, int docCount)
        {
            /* Prepare necessary variables */
            // Source buffers, one per block
            SourceBuffer[] srcBuffer = new SourceBuffer[blockCount];
            // Destination buffer for term-postings list
            DestinationBuffer desBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + ".txt");
            // Destination buffer for the offset of each term record
            DestinationBuffer dictBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_map.txt");
            int offset = 0;
            // Current (not yet merged) term of each source buffer; "" means "none loaded"
            string[] terms = new string[blockCount];
            // Number of posting ints (two per posting) of the current term in each source buffer
            int[] df = new int[blockCount];
            // Accumulated squared tf-idf weight of every document
            Dictionary<int, double> length = new Dictionary<int, double>(docCount);

            for (int i = 0; i < blockCount; ++i)
            {
                srcBuffer[i] = new SourceBuffer(Disk.DISKBLOCK_SIZE, "..//..//SPIMI//" + inFileName + i.ToString() + ".txt");
                terms[i] = "";
            }

            /* Start merging */
            while (true)
            {
                string minTerm = "";
                bool allPortionsConsumed = true;
                // Iterate through all source buffers to find the smallest pending term
                for (int i = 0; i < blockCount; ++i)
                {
                    // A completely consumed block no longer takes part in the merge
                    if (srcBuffer[i].IsCompletelyRead())
                    {
                        continue;
                    }
                    allPortionsConsumed = false;
                    // Load the next term of this block if none is pending
                    if (terms[i].Length == 0)
                    {
                        int len = srcBuffer[i].GetInt();
                        terms[i] = srcBuffer[i].GetString(len);
                    }
                    // Track the lexicographically smallest term
                    if (minTerm.Length == 0 || minTerm.CompareTo(terms[i]) > 0)
                    {
                        minTerm = terms[i];
                    }
                }
                // Stop if all portions have been merged
                if (allPortionsConsumed)
                {
                    // Write the remaining data on destination buffer to disk
                    desBuffer.WriteToDiskOnRequest();
                    break;
                }

                /* Store the smallest term's postings list to destination buffer */
                // Store the term: its length followed by its characters
                desBuffer.Store(BitConverter.GetBytes(minTerm.Length));
                for (int i = 0; i < minTerm.Length; ++i)
                {
                    desBuffer.Store(BitConverter.GetBytes(minTerm[i]));
                }
                // Store the offset of this term record, then advance past the term
                // (4 bytes for the length, 2 bytes per char)
                dictBuffer.Store(BitConverter.GetBytes(offset));
                offset += 4 + (minTerm.Length << 1);

                // Sum the posting-int counts of every block that holds this term
                int docFrequency = 0;
                for (int i = 0; i < blockCount; ++i)
                {
                    if (terms[i] == minTerm)
                    {
                        df[i] = srcBuffer[i].GetInt();
                        docFrequency += df[i];
                    }
                }
                // 4 bytes for the count itself, 4 bytes per posting int
                offset += 4 + (docFrequency << 2);
                desBuffer.Store(BitConverter.GetBytes(docFrequency));

                // idf is a property of the term, so it is derived from the merged count.
                // Each posting is two ints (docId, frequency), hence the shift. The division
                // is done in floating point so the ratio is not truncated before the log.
                double idf = 1 + Math.Log10((double)docCount / (docFrequency >> 1));

                // Copy the postings of every block that holds this term
                for (int i = 0; i < blockCount; ++i)
                {
                    if (terms[i] != minTerm)
                    {
                        continue;
                    }

                    int postingCount = df[i] >> 1;
                    for (int j = 0; j < postingCount; ++j)
                    {
                        int docId = srcBuffer[i].GetInt();
                        int f = srcBuffer[i].GetInt();
                        double tf_idf = (1 + Math.Log10(f)) * idf;

                        // A missing docId leaves the accumulator at 0, i.e. a fresh entry
                        double sumOfSquares;
                        length.TryGetValue(docId, out sumOfSquares);
                        length[docId] = sumOfSquares + tf_idf * tf_idf;

                        desBuffer.Store(BitConverter.GetBytes(docId));
                        desBuffer.Store(BitConverter.GetBytes(f));
                    }
                    // Mark the term as consumed so the next one is loaded from this block
                    terms[i] = "";
                }
            }

            dictBuffer.WriteToDiskOnRequest();

            // Write the accumulated weights of all document vectors to disk
            DestinationBuffer lenBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_length.txt");
            foreach (KeyValuePair<int, double> docLen in length)
            {
                lenBuffer.Store(BitConverter.GetBytes(docLen.Key));
                lenBuffer.Store(BitConverter.GetBytes(docLen.Value));
            }
            lenBuffer.WriteToDiskOnRequest();
        }
예제 #4
0
        // Merge the SPIMI block files into a single index file, an offset map and a
        // document-vector-length file.
        //
        // Block i is read from "..//..//SPIMI//" + inFileName + i + ".txt". Three files are
        // written, all prefixed with outFileName:
        //   ".txt"        - per term: term length, term chars, number of posting ints,
        //                   then the (docId, frequency) pairs of every block in block order
        //   "_map.txt"    - running offset of each term record, derived from the sizes of
        //                   the values stored in the file above
        //   "_length.txt" - (docId, sum of squared tf-idf weights) pairs
        // docCount is the number of documents in the collection (used for idf).
        //
        // NOTE(review): terms are compared with string.CompareTo; this presumably matches the
        // sort order used when the blocks were written - confirm.
        private void MergeBlocks(string inFileName, int blockCount, string outFileName, int docCount)
        {
            /* Prepare necessary variables */
            // Source buffers, one per block
            SourceBuffer[] srcBuffer = new SourceBuffer[blockCount];
            // Destination buffer for term-postings list
            DestinationBuffer desBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + ".txt");
            // Destination buffer for the offset of each term record
            DestinationBuffer dictBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_map.txt");
            int offset = 0;

            // Pending term of each source buffer; the empty string means "nothing loaded yet"
            string[] terms = new string[blockCount];
            // Number of posting ints (docId + frequency per posting) for the pending term
            int[] df = new int[blockCount];
            // Sum of squared tf-idf weights per document
            Dictionary<int, double> length = new Dictionary<int, double>(docCount);

            for (int i = 0; i < blockCount; ++i)
            {
                // Open block i and mark it as having no pending term
                srcBuffer[i] = new SourceBuffer(Disk.DISKBLOCK_SIZE, "..//..//SPIMI//" + inFileName + i.ToString() + ".txt");
                terms[i] = "";
            }

            /* Start merging */
            while (true)
            {
                string minTerm = "";
                bool allPortionsConsumed = true;
                // Find the smallest pending term across all blocks that still have data
                for (int i = 0; i < blockCount; ++i)
                {
                    if (srcBuffer[i].IsCompletelyRead())
                    {
                        continue;
                    }
                    allPortionsConsumed = false;
                    // Load the next term of this block when the previous one was consumed
                    if (terms[i].Length == 0)
                    {
                        int len = srcBuffer[i].GetInt();
                        terms[i] = srcBuffer[i].GetString(len);
                    }
                    if (minTerm.Length == 0 || minTerm.CompareTo(terms[i]) > 0)
                    {
                        minTerm = terms[i];
                    }
                }
                // Every block is exhausted: flush what is left and finish
                if (allPortionsConsumed)
                {
                    desBuffer.WriteToDiskOnRequest();
                    break;
                }

                /* Store the smallest term's postings list to destination buffer */
                // Term record header: length followed by the characters
                desBuffer.Store(BitConverter.GetBytes(minTerm.Length));
                for (int i = 0; i < minTerm.Length; ++i)
                {
                    desBuffer.Store(BitConverter.GetBytes(minTerm[i]));
                }
                // Remember where this record starts, then skip the header
                // (4 bytes for the length, 2 bytes per char)
                dictBuffer.Store(BitConverter.GetBytes(offset));
                offset += 4 + (minTerm.Length << 1);

                // Total number of posting ints for this term over all blocks holding it
                int docFrequency = 0;
                for (int i = 0; i < blockCount; ++i)
                {
                    if (terms[i] == minTerm)
                    {
                        df[i] = srcBuffer[i].GetInt();
                        docFrequency += df[i];
                    }
                }
                // 4 bytes for the count plus 4 bytes per posting int
                offset += 4 + (docFrequency << 2);
                desBuffer.Store(BitConverter.GetBytes(docFrequency));

                // One idf per term, taken from the merged count (two ints per posting, hence
                // the shift). Floating-point division keeps the fractional part of the ratio
                // that integer division would drop before the logarithm.
                double idf = 1 + Math.Log10((double)docCount / (docFrequency >> 1));

                // Append the postings of each contributing block
                for (int i = 0; i < blockCount; ++i)
                {
                    if (terms[i] != minTerm)
                    {
                        continue;
                    }

                    int postingCount = df[i] >> 1;
                    for (int j = 0; j < postingCount; ++j)
                    {
                        int docId = srcBuffer[i].GetInt();
                        int f = srcBuffer[i].GetInt();
                        double tf_idf = (1 + Math.Log10(f)) * idf;

                        // TryGetValue yields 0 for an unseen docId, which starts a new sum
                        double sumOfSquares;
                        length.TryGetValue(docId, out sumOfSquares);
                        length[docId] = sumOfSquares + tf_idf * tf_idf;

                        desBuffer.Store(BitConverter.GetBytes(docId));
                        desBuffer.Store(BitConverter.GetBytes(f));
                    }
                    // Consumed: the next pass loads this block's following term
                    terms[i] = "";
                }
            }

            dictBuffer.WriteToDiskOnRequest();

            // Persist the accumulated weight of every document vector
            DestinationBuffer lenBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_length.txt");

            foreach (KeyValuePair<int, double> docLen in length)
            {
                lenBuffer.Store(BitConverter.GetBytes(docLen.Key));
                lenBuffer.Store(BitConverter.GetBytes(docLen.Value));
            }
            lenBuffer.WriteToDiskOnRequest();
        }