// Load the precomputed document-vector lengths from disk into the length map.
// The length file is a sequence of (int docId, double length) records;
// records are consumed until the buffer reports end-of-data.
protected void GetVectorLength()
{
    SourceBuffer reader = new SourceBuffer(Disk.DISKBLOCK_SIZE, lenPath);
    for (; !reader.IsCompletelyRead(); )
    {
        // Read order matters: docId (int) first, then its length (double)
        int docId = reader.GetInt();
        double docLength = reader.GetDouble();
        length.Add(docId, docLength);
    }
}
// Merge all SPIMI blocks into one inverted index on disk.
//
// inFileName:  base name of the per-block input files, read from
//              "..//..//SPIMI//" + inFileName + i + ".txt"
// blockCount:  number of SPIMI blocks to merge
// outFileName: base name of the outputs — merged index (<out>.txt),
//              term offset map (<out>_map.txt), and document-vector
//              lengths (<out>_length.txt)
// docCount:    total number of documents; used for the idf computation
private void MergeBlocks(string inFileName, int blockCount, string outFileName, int docCount)
{
    /* Prepare necessary variables */
    // One source buffer per SPIMI block
    SourceBuffer[] srcBuffer = new SourceBuffer[blockCount];
    // Destination buffer for the merged term/postings data
    DestinationBuffer desBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + ".txt");
    // Destination buffer for the byte offset of each term in the merged index
    DestinationBuffer dictBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_map.txt");
    int offset = 0;
    // Pending term read from each source buffer; "" means none buffered yet
    string[] terms = new string[blockCount];
    // Doc-frequency value last read from each source buffer.
    // NOTE(review): the on-disk value appears to be twice the posting count —
    // the loop below iterates df[i] >> 1 times reading two ints per posting;
    // TODO confirm against the SPIMI block writer.
    int[] df = new int[blockCount];
    // Accumulated SQUARED tf-idf length of every document vector (no sqrt here)
    Dictionary<int, double> length = new Dictionary<int, double>(docCount);

    for (int i = 0; i < blockCount; ++i)
    {
        // Initialize each source buffer and mark its term slot as empty
        srcBuffer[i] = new SourceBuffer(Disk.DISKBLOCK_SIZE, "..//..//SPIMI//" + inFileName + i.ToString() + ".txt");
        terms[i] = "";
    }

    /* Start merging */
    while (true)
    {
        string minTerm = "";
        bool allPortionsConsumed = true;

        // Find the lexicographically smallest pending term across all buffers
        for (int i = 0; i < blockCount; ++i)
        {
            // Skip buffers that are completely consumed
            if (srcBuffer[i].IsCompletelyRead())
                continue;
            allPortionsConsumed = false;

            // Lazily read the next term (length-prefixed string) if none is buffered
            if (terms[i].Equals(""))
            {
                int len = srcBuffer[i].GetInt();
                terms[i] = srcBuffer[i].GetString(len);
            }

            if (minTerm.Equals("") || minTerm.CompareTo(terms[i]) > 0)
                minTerm = terms[i];
        }

        // All blocks exhausted: flush the remaining index data and stop
        if (allPortionsConsumed)
        {
            desBuffer.WriteToDiskOnRequest();
            break;
        }

        /* Store the smallest term's merged postings list */
        // Term: 4-byte length prefix followed by one 2-byte value per char
        desBuffer.Store(BitConverter.GetBytes(minTerm.Length));
        for (int i = 0; i < minTerm.Length; ++i)
            desBuffer.Store(BitConverter.GetBytes(minTerm[i]));

        // Record where this term's entry starts in the index file
        dictBuffer.Store(BitConverter.GetBytes(offset));
        offset += 4 + (minTerm.Length << 1); // length prefix + 2 bytes per char

        // Sum the doc frequency over every block holding this term
        int docFrequency = 0;
        for (int i = 0; i < blockCount; ++i)
        {
            if (terms[i] == minTerm)
            {
                df[i] = srcBuffer[i].GetInt();
                docFrequency += df[i];
            }
        }
        offset += 4 + (docFrequency << 2); // df field + 4 bytes per stored int
        desBuffer.Store(BitConverter.GetBytes(docFrequency));

        // Copy the postings and accumulate each document's squared vector length
        for (int i = 0; i < blockCount; ++i)
        {
            if (terms[i] == minTerm)
            {
                // FIX: cast to double before dividing. The original integer
                // division truncated the idf ratio and produced Log10(0)
                // (-Infinity) whenever the posting count exceeded docCount.
                double idf = 1 + Math.Log10((double)docCount / (df[i] >> 1));
                for (int j = 0; j < (df[i] >> 1); ++j)
                {
                    int docId = srcBuffer[i].GetInt();
                    int f = srcBuffer[i].GetInt();
                    double tf_idf = (1 + Math.Log10(f)) * idf;
                    if (!length.ContainsKey(docId))
                        length.Add(docId, 0);
                    length[docId] += tf_idf * tf_idf;
                    desBuffer.Store(BitConverter.GetBytes(docId));
                    desBuffer.Store(BitConverter.GetBytes(f));
                }
                // Consumed — force a fresh term read from this buffer next round
                terms[i] = "";
            }
        }
    }
    dictBuffer.WriteToDiskOnRequest();

    // Persist the (docId, squared length) pairs to disk
    DestinationBuffer lenBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_length.txt");
    foreach (KeyValuePair<int, double> docLen in length)
    {
        lenBuffer.Store(BitConverter.GetBytes(docLen.Key));
        lenBuffer.Store(BitConverter.GetBytes(docLen.Value));
    }
    lenBuffer.WriteToDiskOnRequest();
    return;
}
// Merge all SPIMI blocks into one inverted index on disk.
//
// inFileName:  base name of the per-block input files, read from
//              "..//..//SPIMI//" + inFileName + i + ".txt"
// blockCount:  number of SPIMI blocks to merge
// outFileName: base name of the outputs — merged index (<out>.txt),
//              term offset map (<out>_map.txt), and document-vector
//              lengths (<out>_length.txt)
// docCount:    total number of documents; used for the idf computation
private void MergeBlocks(string inFileName, int blockCount, string outFileName, int docCount)
{
    /* Prepare necessary variables */
    // One source buffer per SPIMI block
    SourceBuffer[] srcBuffer = new SourceBuffer[blockCount];
    // Destination buffer for the merged term/postings data
    DestinationBuffer desBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + ".txt");
    // Destination buffer for the byte offset of each term in the merged index
    DestinationBuffer dictBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_map.txt");
    int offset = 0;
    // Pending term read from each source buffer; "" means none buffered yet
    string[] terms = new string[blockCount];
    // Doc-frequency value last read from each source buffer.
    // NOTE(review): the on-disk value appears to be twice the posting count —
    // the loop below iterates df[i] >> 1 times reading two ints per posting;
    // TODO confirm against the SPIMI block writer.
    int[] df = new int[blockCount];
    // Accumulated SQUARED tf-idf length of every document vector (no sqrt here)
    Dictionary<int, double> length = new Dictionary<int, double>(docCount);

    for (int i = 0; i < blockCount; ++i)
    {
        // Initialize each source buffer and mark its term slot as empty
        srcBuffer[i] = new SourceBuffer(Disk.DISKBLOCK_SIZE, "..//..//SPIMI//" + inFileName + i.ToString() + ".txt");
        terms[i] = "";
    }

    /* Start merging */
    while (true)
    {
        string minTerm = "";
        bool allPortionsConsumed = true;

        // Find the lexicographically smallest pending term across all buffers
        for (int i = 0; i < blockCount; ++i)
        {
            // Skip buffers that are completely consumed
            if (srcBuffer[i].IsCompletelyRead())
            {
                continue;
            }
            allPortionsConsumed = false;

            // Lazily read the next term (length-prefixed string) if none is buffered
            if (terms[i].Equals(""))
            {
                int len = srcBuffer[i].GetInt();
                terms[i] = srcBuffer[i].GetString(len);
            }

            if (minTerm.Equals("") || minTerm.CompareTo(terms[i]) > 0)
            {
                minTerm = terms[i];
            }
        }

        // All blocks exhausted: flush the remaining index data and stop
        if (allPortionsConsumed)
        {
            desBuffer.WriteToDiskOnRequest();
            break;
        }

        /* Store the smallest term's merged postings list */
        // Term: 4-byte length prefix followed by one 2-byte value per char
        desBuffer.Store(BitConverter.GetBytes(minTerm.Length));
        for (int i = 0; i < minTerm.Length; ++i)
        {
            desBuffer.Store(BitConverter.GetBytes(minTerm[i]));
        }

        // Record where this term's entry starts in the index file
        dictBuffer.Store(BitConverter.GetBytes(offset));
        offset += 4 + (minTerm.Length << 1); // length prefix + 2 bytes per char

        // Sum the doc frequency over every block holding this term
        int docFrequency = 0;
        for (int i = 0; i < blockCount; ++i)
        {
            if (terms[i] == minTerm)
            {
                df[i] = srcBuffer[i].GetInt();
                docFrequency += df[i];
            }
        }
        offset += 4 + (docFrequency << 2); // df field + 4 bytes per stored int
        desBuffer.Store(BitConverter.GetBytes(docFrequency));

        // Copy the postings and accumulate each document's squared vector length
        for (int i = 0; i < blockCount; ++i)
        {
            if (terms[i] == minTerm)
            {
                // FIX: cast to double before dividing. The original integer
                // division truncated the idf ratio and produced Log10(0)
                // (-Infinity) whenever the posting count exceeded docCount.
                double idf = 1 + Math.Log10((double)docCount / (df[i] >> 1));
                for (int j = 0; j < (df[i] >> 1); ++j)
                {
                    int docId = srcBuffer[i].GetInt();
                    int f = srcBuffer[i].GetInt();
                    double tf_idf = (1 + Math.Log10(f)) * idf;
                    if (!length.ContainsKey(docId))
                    {
                        length.Add(docId, 0);
                    }
                    length[docId] += tf_idf * tf_idf;
                    desBuffer.Store(BitConverter.GetBytes(docId));
                    desBuffer.Store(BitConverter.GetBytes(f));
                }
                // Consumed — force a fresh term read from this buffer next round
                terms[i] = "";
            }
        }
    }
    dictBuffer.WriteToDiskOnRequest();

    // Persist the (docId, squared length) pairs to disk
    DestinationBuffer lenBuffer = new DestinationBuffer(Disk.DISKBLOCK_SIZE, outFileName + "_length.txt");
    foreach (KeyValuePair<int, double> docLen in length)
    {
        lenBuffer.Store(BitConverter.GetBytes(docLen.Key));
        lenBuffer.Store(BitConverter.GetBytes(docLen.Value));
    }
    lenBuffer.WriteToDiskOnRequest();
    return;
}