/// <summary>
/// Adds a single triple to this bucket by serializing it either to the active batch
/// writer or to the end of the bucket file.
/// </summary>
/// <remarks>
/// When the bucket is flagged as a mini bucket, the primary position of the bucket's sort
/// order is handed to the serializer along with the triple; otherwise the triple is
/// serialized without it.
/// </remarks>
/// <param name="triple">The triple to insert.</param>
public void Insert(Triple<Atom, Atom, Atom> triple)
{
    if (!m_isBatchInserting)
    {
        //
        // regular insertion: open the bucket file in append mode with exclusive access,
        // write the one triple, and let the using block close the file again
        using (var appender = new BinaryWriter(File.Open(m_fileName, FileMode.Append, FileAccess.Write, FileShare.None)))
        {
            if (!m_isMiniBucket)
            {
                TripleSerializer.Write(appender, triple);
            }
            else
            {
                TripleSerializer.Write(appender, triple, m_order.Primary);
            }
        }

        return;
    }

    //
    // batch insertion: reuse the writer that was set up for the batch instead of opening
    // the bucket file for every triple
    if (!m_isMiniBucket)
    {
        TripleSerializer.Write(m_batchWriter, triple);
        return;
    }

    TripleSerializer.Write(m_batchWriter, triple, m_order.Primary);
}
/// <summary>
/// Sort a TripleT bucket in external memory. The bucket file is overwritten.
/// </summary>
/// <remarks>
/// The sort runs in two steps. First the bucket is read in chunks of at most
/// <paramref name="maxTriplesInMemory"/> triples; each chunk is sorted in memory and written
/// to its own temporary chunk file. Then all chunk files are merged (k-way merge) into the
/// bucket file, using one buffer page per chunk plus one output page.
///
/// Each page holds at least one triple. If the bucket produces more chunks than
/// <paramref name="maxTriplesInMemory"/> can be divided over, the merge therefore holds
/// slightly more triples in memory than requested instead of failing.
///
/// Cleanup: chunk cursors are always closed. Chunk files are deleted on success, and also
/// when the sort fails before the bucket file has been touched. If the sort fails while the
/// bucket file is being overwritten, the chunk files are left on disk because at that point
/// they hold the only complete copy of the data.
/// </remarks>
/// <param name="bucket">The bucket to sort.</param>
/// <param name="maxTriplesInMemory">The maximum number of triples to keep in memory at any point. Must be at least 1.</param>
/// <exception cref="ArgumentNullException"><paramref name="bucket"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxTriplesInMemory"/> is smaller than 1.</exception>
public static void SortBucket(Bucket bucket, int maxTriplesInMemory)
{
    if (bucket == null)
    {
        throw new ArgumentNullException("bucket");
    }

    //
    // a chunk size of zero would never consume a triple from the bucket and so would loop
    // forever producing empty chunk files; reject it (and negative sizes) up front
    if (maxTriplesInMemory < 1)
    {
        throw new ArgumentOutOfRangeException("maxTriplesInMemory", maxTriplesInMemory, "At least one triple must fit in memory.");
    }

    //
    // temp files used to store the sorted chunks
    var chunkFiles = new List<string>();

    //
    // comparer used to order two triples. the priorities are taken from the sort order
    // of the bucket
    var sorter = new TripleComparer(bucket.SortOrder.Primary, bucket.SortOrder.Secondary, bucket.SortOrder.Tertiary);

    //
    // cursors over the chunk files; created in step 2, released in the finally block
    TripleCursor[] chunkCursors = null;

    //
    // bookkeeping for the cleanup in the finally block
    var overwriteStarted = false;
    var completed = false;

    try
    {
        //
        // step 1: sort in chunks that fit in the memory we have. the size of each chunk is
        // equal to the amount of memory provided (given in numbers of triples)
        using (var cursor = bucket.OpenRead())
        {
            //
            // number of triples currently held in the chunk
            var count = 0;

            //
            // current chunk, represented as an array of triples
            var chunk = new Triple<Atom, Atom, Atom>[maxTriplesInMemory];

            while (cursor.HasNext)
            {
                //
                // the chunk is full and there is more to read: sort it and write it out
                // before reading on. slots are simply overwritten during the next pass;
                // only the first count slots are ever sorted or written
                if (count == maxTriplesInMemory)
                {
                    WriteSortedChunk(chunk, count, sorter, chunkFiles);
                    count = 0;
                }

                chunk[count] = cursor.Next();
                count++;
            }

            //
            // the entire bucket has been read, but the last chunk still needs handling
            if (count > 0)
            {
                WriteSortedChunk(chunk, count, sorter, chunkFiles);
            }
        }

        //
        // force garbage collection so the chunk array is released before the merge
        // buffer is allocated
        GC.Collect();

        //
        // step 2: perform k-way merge sort on the chunks.
        //
        // the number of pages used for reading must be equal to the number of chunk files
        var numReadPages = chunkFiles.Count;

        //
        // the total number of pages has one additional page for writing the output to
        var numPages = numReadPages + 1;

        //
        // the size of each page is the number of triples that may be kept in memory
        // divided by the number of pages, but never less than one triple: a page without
        // any slot cannot take part in the merge
        var pageSize = Math.Max(1, maxTriplesInMemory / numPages);

        //
        // the buffer simply contains the pages
        var buffer = new TripleBuffer(numPages, pageSize);

        //
        // the last page is the one used for writing the output to
        var pageOutputId = numPages - 1;

        //
        // the page cursors are integers denoting the position of the reading pointer
        // (or, for the output page, the writing pointer) in each of the pages. a new int
        // array starts out with every position at zero
        var pageCursors = new int[numPages];

        //
        // open a triple cursor on every chunk file and read the initial part of each
        // chunk into its buffer page
        chunkCursors = new TripleCursor[numReadPages];
        for (int i = 0; i < numReadPages; i++)
        {
            chunkCursors[i] = new TripleCursor(File.Open(chunkFiles[i], FileMode.Open, FileAccess.Read, FileShare.Read));
            ReadToPage(chunkCursors[i], buffer[i]);
        }

        //
        // from here on the original bucket contents are being replaced. this is possible
        // because all information from the original bucket is now duplicated in the
        // sorted chunks
        overwriteStarted = true;
        using (var sw = new BinaryWriter(File.Open(bucket.FileName, FileMode.Create, FileAccess.Write, FileShare.None)))
        {
            //
            // the currently smallest triple found, i.e. the one that needs to be written
            // next to maintain the sort order. null means all read pages are exhausted
            Triple<Atom, Atom, Atom> minTriple;

            do
            {
                minTriple = null;

                //
                // id of the chunk (and page) the smallest triple resides in
                var minChunkId = -1;

                //
                // find the smallest triple among the heads of all read pages
                for (int i = 0; i < numReadPages; i++)
                {
                    //
                    // the smallest remaining triple of a page is always the one its
                    // cursor points to. a null means the page holds no more triples;
                    // this assumes that every position after a null on the same page is
                    // null as well
                    var t = buffer[i][pageCursors[i]];
                    if (t == null)
                    {
                        continue;
                    }

                    if (minTriple == null || sorter.Compare(t, minTriple) < 0)
                    {
                        minTriple = t;
                        minChunkId = i;
                    }
                }

                if (minTriple != null)
                {
                    //
                    // advance the cursor of the page the triple was taken from. if the
                    // page is used up, read the next set of triples from the chunk file
                    // and start over at the beginning of the page
                    pageCursors[minChunkId]++;
                    if (pageCursors[minChunkId] >= pageSize)
                    {
                        ReadToPage(chunkCursors[minChunkId], buffer[minChunkId]);
                        pageCursors[minChunkId] = 0;
                    }

                    //
                    // put the triple on the output page
                    buffer[pageOutputId][pageCursors[pageOutputId]] = minTriple;
                    pageCursors[pageOutputId]++;

                    //
                    // if the output page is full, write its triples to the output file,
                    // clear the page, and reset the cursor
                    if (pageCursors[pageOutputId] >= pageSize)
                    {
                        for (int i = 0; i < pageSize; i++)
                        {
                            TripleSerializer.Write(sw, buffer[pageOutputId][i]);
                        }

                        buffer[pageOutputId].Clear();
                        pageCursors[pageOutputId] = 0;
                    }
                }
            }
            while (minTriple != null);

            //
            // all chunks have been merged, but the last triples may still reside on the
            // output page. the output cursor tells exactly how many there are
            for (int i = 0; i < pageCursors[pageOutputId]; i++)
            {
                TripleSerializer.Write(sw, buffer[pageOutputId][i]);
            }
        }

        completed = true;
    }
    finally
    {
        //
        // always close the chunk cursors so no file handles stay open
        if (chunkCursors != null)
        {
            foreach (var chunkCursor in chunkCursors)
            {
                if (chunkCursor != null)
                {
                    chunkCursor.Dispose();
                }
            }
        }

        if (completed)
        {
            //
            // the sorted bucket is complete; the chunk files are no longer needed
            foreach (var chunkFile in chunkFiles)
            {
                File.Delete(chunkFile);
            }
        }
        else if (!overwriteStarted)
        {
            //
            // the sort failed while the original bucket was still intact, so the chunk
            // files are just garbage. deletion is best effort here so that a failure to
            // delete does not hide the exception that caused the sort to fail
            foreach (var chunkFile in chunkFiles)
            {
                try
                {
                    File.Delete(chunkFile);
                }
                catch (IOException)
                {
                }
                catch (UnauthorizedAccessException)
                {
                }
            }
        }

        //
        // otherwise the sort failed while the bucket file was being overwritten: the
        // chunk files are deliberately kept since they hold the only complete copy of
        // the data
    }
}

/// <summary>
/// Sorts the first <paramref name="count"/> triples of a chunk in memory and writes them to
/// a newly named temporary chunk file, which is registered in <paramref name="chunkFiles"/>.
/// </summary>
/// <param name="chunk">The chunk array; only its first <paramref name="count"/> slots are used.</param>
/// <param name="count">The number of triples in the chunk.</param>
/// <param name="sorter">The comparer that defines the sort order.</param>
/// <param name="chunkFiles">The list the name of the new chunk file is added to.</param>
private static void WriteSortedChunk(Triple<Atom, Atom, Atom>[] chunk, int count, TripleComparer sorter, List<string> chunkFiles)
{
    //
    // sort only the filled part of the array, so unused slots never reach the comparer
    Array.Sort(chunk, 0, count, sorter);

    //
    // generate a file name for the chunk; it needs to be unique among the chunk files
    var fileName = String.Format("~{0}.chunk.tmp", Generator.GetRandomFilename(12));

    //
    // register the file before creating it, so that it is known to the cleanup code even
    // if writing it fails half way
    chunkFiles.Add(fileName);

    using (var sw = new BinaryWriter(File.Open(fileName, FileMode.Create, FileAccess.Write, FileShare.None)))
    {
        for (int j = 0; j < count; j++)
        {
            if (chunk[j] != null)
            {
                TripleSerializer.Write(sw, chunk[j]);
            }
        }
    }
}