private List<long> DoPersist(TreeWrite sequence, int tryCount) { // NOTE: nodes are written in order of branches and then leaf nodes. All // branch nodes and leafs are grouped together. // The list of nodes to be allocated, IList<ITreeNode> allBranches = sequence.BranchNodes; IList<ITreeNode> allLeafs = sequence.LeafNodes; List<ITreeNode> nodes = new List<ITreeNode>(allBranches.Count + allLeafs.Count); nodes.AddRange(allBranches); nodes.AddRange(allLeafs); int sz = nodes.Count; // The list of allocated referenced for the nodes, DataAddress[] refs = new DataAddress[sz]; long[] outRefs = new long[sz]; MessageStream allocateMessage = new MessageStream(MessageType.Request); // Make a connection with the manager server, IMessageProcessor manager = connector.Connect(managerAddress, ServiceType.Manager); // Allocate the space first, for (int i = 0; i < sz; ++i) { ITreeNode node = nodes[i]; RequestMessage request = new RequestMessage("allocateNode"); // Is it a branch node? if (node is TreeBranch) { // Branch nodes are 1K in size, request.Arguments.Add(1024); } else { // Leaf nodes are 4k in size, request.Arguments.Add(4096); } allocateMessage.AddMessage(request); } // The result of the set of allocations, MessageStream resultStream = (MessageStream) manager.Process(allocateMessage); //DEBUG: ++network_comm_count; // The unique list of blocks, List<long> uniqueBlocks = new List<long>(); // Parse the result stream one message at a time, the order will be the // order of the allocation messages, int n = 0; foreach (ResponseMessage m in resultStream) { if (m.HasError) throw m.Error.AsException(); DataAddress addr = (DataAddress) m.Arguments[0].Value; refs[n] = addr; // Make a list of unique block identifiers, if (!uniqueBlocks.Contains(addr.BlockId)) { uniqueBlocks.Add(addr.BlockId); } ++n; } // Get the block to server map for each of the blocks, IDictionary<long, IList<BlockServerElement>> blockToServerMap = GetServersForBlock(uniqueBlocks); // Make message streams for each unique block int ubid_count = uniqueBlocks.Count; MessageStream[] ubidStream = new MessageStream[ubid_count]; for (int i = 0; i < ubidStream.Length; ++i) { ubidStream[i] = new MessageStream(MessageType.Request); } // Scan all the blocks and create the message streams, for (int i = 0; i < sz; ++i) { byte[] nodeBuf; ITreeNode node = nodes[i]; // Is it a branch node? if (node is TreeBranch) { TreeBranch branch = (TreeBranch)node; // Make a copy of the branch (NOTE; we Clone() the array here). long[] curNodeData = (long[])branch.ChildPointers.Clone(); int curNdsz = branch.DataSize; branch = new TreeBranch(refs[i].Value, curNodeData, curNdsz); // The number of children int chsz = branch.ChildCount; // For each child, if it's a heap node, look up the child id and // reference map in the sequence and set the reference accordingly, for (int o = 0; o < chsz; ++o) { long childRef = branch.GetChild(o); if (childRef < 0) { // The ref is currently on the heap, so adjust accordingly int ref_id = sequence.LookupRef(i, o); branch.SetChildOverride(o, refs[ref_id].Value); } } // Turn the branch into a 'node_buf' byte[] array object for // serialization. long[] nodeData = branch.ChildPointers; int ndsz = branch.DataSize; MemoryStream bout = new MemoryStream(1024); BinaryWriter dout = new BinaryWriter(bout, Encoding.Unicode); dout.Write(BranchType); dout.Write(ndsz); for (int o = 0; o < ndsz; ++o) { dout.Write(nodeData[o]); } dout.Flush(); // Turn it into a byte array, nodeBuf = bout.ToArray(); // Put this branch into the local cache, networkCache.SetNode(refs[i], branch); } else { // If it's a leaf node, TreeLeaf leaf = (TreeLeaf)node; int lfsz = leaf.Length; nodeBuf = new byte[lfsz + 6]; // Technically, we could comment these next two lines out. ByteBuffer.WriteInt2(LeafType, nodeBuf, 0); ByteBuffer.WriteInt4(lfsz, nodeBuf, 2); leaf.Read(0, nodeBuf, 6, lfsz); // Put this leaf into the local cache, leaf = new ByteArrayTreeLeaf(refs[i].Value, nodeBuf); networkCache.SetNode(refs[i], leaf); } // The DataAddress this node is being written to, DataAddress address = refs[i]; // Get the block id, long blockId = address.BlockId; int bid = uniqueBlocks.IndexOf(blockId); RequestMessage request = new RequestMessage("writeToBlock"); request.Arguments.Add(address); request.Arguments.Add(nodeBuf); request.Arguments.Add(0); request.Arguments.Add(nodeBuf.Length); ubidStream[bid].AddMessage(request); // Update 'outRefs' array, outRefs[i] = refs[i].Value; } // A log of successfully processed operations, List<object> successProcess = new List<object>(64); // Now process the streams on the servers, for (int i = 0; i < ubidStream.Length; ++i) { // The output message, MessageStream requestMessageStream = ubidStream[i]; // Get the servers this message needs to be sent to, long block_id = uniqueBlocks[i]; IList<BlockServerElement> blockServers = blockToServerMap[block_id]; // Format a message for writing this node out, int bssz = blockServers.Count; IMessageProcessor[] blockServerProcs = new IMessageProcessor[bssz]; // Make the block server connections, for (int o = 0; o < bssz; ++o) { IServiceAddress address = blockServers[o].Address; blockServerProcs[o] = connector.Connect(address, ServiceType.Block); MessageStream responseMessageStream = (MessageStream) blockServerProcs[o].Process(requestMessageStream); //DEBUG: ++network_comm_count; if (responseMessageStream.HasError) { // If this is an error, we need to report the failure to the // manager server, ReportBlockServerFailure(address); // Remove the block id from the server list cache, networkCache.RemoveServers(block_id); // Rollback any server writes already successfully made, for (int p = 0; p < successProcess.Count; p += 2) { IServiceAddress blockAddress = (IServiceAddress) successProcess[p]; MessageStream toRollback = (MessageStream) successProcess[p + 1]; List<DataAddress> rollbackNodes = new List<DataAddress>(128); foreach(Message rm in toRollback) { DataAddress raddr = (DataAddress) rm.Arguments[0].Value; rollbackNodes.Add(raddr); } // Create the rollback message, RequestMessage rollbackRequest = new RequestMessage("rollbackNodes"); rollbackRequest.Arguments.Add(rollbackNodes.ToArray()); // Send it to the block server, Message responseMessage = connector.Connect(blockAddress, ServiceType.Block).Process(rollbackRequest); //DEBUG: ++network_comm_count; // If rollback generated an error we throw the error now // because this likely is a serious network error. if (responseMessage.HasError) throw new NetworkException("Rollback wrote failed: " + responseMessage.ErrorMessage); } // Retry, if (tryCount > 0) return DoPersist(sequence, tryCount - 1); // Otherwise we fail the write throw new NetworkException(responseMessageStream.ErrorMessage); } // If we succeeded without an error, add to the log successProcess.Add(address); successProcess.Add(requestMessageStream); } } // Return the references, return new List<long>(outRefs); }
private NodeId[] InternalPersist(TreeWrite sequence, int tryCount) { // NOTE: nodes are written in order of branches and then leaf nodes. All // branch nodes and leafs are grouped together. // The list of nodes to be allocated, IList<ITreeNode> allBranches = sequence.BranchNodes; IList<ITreeNode> allLeafs = sequence.LeafNodes; List<ITreeNode> nodes = new List<ITreeNode>(allBranches.Count + allLeafs.Count); nodes.AddRange(allBranches); nodes.AddRange(allLeafs); int sz = nodes.Count; // The list of allocated referenced for the nodes, DataAddress[] refs = new DataAddress[sz]; NodeId[] outNodeIds = new NodeId[sz]; MessageStream allocateMessageStream = new MessageStream(); // Allocate the space first, for (int i = 0; i < sz; ++i) { ITreeNode node = nodes[i]; // Is it a branch node? if (node is TreeBranch) { // Branch nodes are 1K in size, allocateMessageStream.AddMessage(new Message("allocateNode", 1024)); } // Otherwise, it must be a leaf node, else { // Leaf nodes are 4k in size, allocateMessageStream.AddMessage(new Message("allocateNode", 4096)); } } // Process a command on the manager, IEnumerable<Message> resultStream = ProcessManager(allocateMessageStream); // The unique list of blocks, List<BlockId> uniqueBlocks = new List<BlockId>(); // Parse the result stream one message at a time, the order will be the // order of the allocation messages, int n = 0; foreach (Message m in resultStream) { if (m.HasError) throw new ApplicationException(m.ErrorMessage); DataAddress addr = (DataAddress) m.Arguments[0].Value; refs[n] = addr; // Make a list of unique block identifiers, if (!uniqueBlocks.Contains(addr.BlockId)) { uniqueBlocks.Add(addr.BlockId); } ++n; } // Get the block to server map for each of the blocks, IDictionary<BlockId, IList<BlockServerElement>> blockToServerMap = GetServerListForBlocks(uniqueBlocks); // Make message streams for each unique block int ubidCount = uniqueBlocks.Count; MessageStream[] ubidStream = new MessageStream[ubidCount]; for (int i = 0; i < ubidStream.Length; ++i) { ubidStream[i] = new MessageStream(); } // Scan all the blocks and create the message streams, for (int i = 0; i < sz; ++i) { byte[] nodeBuf; ITreeNode node = nodes[i]; // Is it a branch node? if (node is TreeBranch) { TreeBranch branch = (TreeBranch) node; // Make a copy of the branch (NOTE; we clone() the array here). long[] curNodeData = (long[]) branch.NodeData.Clone(); int curNdsz = branch.NodeDataSize; branch = new TreeBranch(refs[i].Value, curNodeData, curNdsz); // The number of children int chsz = branch.ChildCount; // For each child, if it's a heap node, look up the child id and // reference map in the sequence and set the reference accordingly, for (int o = 0; o < chsz; ++o) { NodeId childId = branch.GetChild(o); if (childId.IsInMemory) { // The ref is currently on the heap, so adjust accordingly int refId = sequence.LookupRef(i, o); branch.SetChildOverride(refs[refId].Value, o); } } // Turn the branch into a 'node_buf' byte[] array object for // serialization. long[] nodeData = branch.NodeData; int ndsz = branch.NodeDataSize; MemoryStream bout = new MemoryStream(1024); BinaryWriter dout = new BinaryWriter(bout); dout.Write(StoreBranchType); dout.Write((short) 0); // Reserved for future dout.Write(0); // The crc32 checksum will be written here, dout.Write(ndsz); for (int o = 0; o < ndsz; ++o) { dout.Write(nodeData[o]); } dout.Flush(); // Turn it into a byte array, nodeBuf = bout.ToArray(); // Write the crc32 of the data, Crc32 checksum = new Crc32(); checksum.ComputeHash(nodeBuf, 8, nodeBuf.Length - 8); ByteBuffer.WriteInt4((int) checksum.CrcValue, nodeBuf, 4); // Put this branch into the local cache, networkCache.SetNode(refs[i], branch); } // If it's a leaf node, else { TreeLeaf leaf = (TreeLeaf) node; int lfsz = leaf.Length; nodeBuf = new byte[lfsz + 12]; // Format the data, ByteBuffer.WriteInt2(StoreLeafType, nodeBuf, 0); ByteBuffer.WriteInt2(0, nodeBuf, 2); // Reserved for future ByteBuffer.WriteInt4(lfsz, nodeBuf, 8); leaf.Read(0, nodeBuf, 12, lfsz); // Calculate and set the checksum, Crc32 checksum = new Crc32(); checksum.ComputeHash(nodeBuf, 8, nodeBuf.Length - 8); ByteBuffer.WriteInt4((int) checksum.CrcValue, nodeBuf, 4); // Put this leaf into the local cache, leaf = new MemoryTreeLeaf(refs[i].Value, nodeBuf); networkCache.SetNode(refs[i], leaf); } // The DataAddress this node is being written to, DataAddress address = refs[i]; // Get the block id, BlockId blockId = address.BlockId; int bid = uniqueBlocks.IndexOf(blockId); ubidStream[bid].AddMessage(new Message("writeToBlock", address, nodeBuf, 0, nodeBuf.Length)); // Update 'out_refs' array, outNodeIds[i] = refs[i].Value; } // A log of successfully processed operations, List<object> successProcess = new List<object>(64); // Now process the streams on the servers, for (int i = 0; i < ubidStream.Length; ++i) { // The output message, MessageStream outputStream = ubidStream[i]; // Get the servers this message needs to be sent to, BlockId blockId = uniqueBlocks[i]; IList<BlockServerElement> blockServers = blockToServerMap[blockId]; // Format a message for writing this node out, int bssz = blockServers.Count; IMessageProcessor[] blockServerProcs = new IMessageProcessor[bssz]; // Make the block server connections, for (int o = 0; o < bssz; ++o) { IServiceAddress address = blockServers[o].Address; blockServerProcs[o] = connector.Connect(address, ServiceType.Block); IEnumerable<Message> inputStream = blockServerProcs[o].Process(outputStream); ++NetworkCommCount; foreach (Message m in inputStream) { if (m.HasError) { // If this is an error, we need to report the failure to the // manager server, ReportBlockServerFailure(address); // Remove the block id from the server list cache, networkCache.RemoveServersWithBlock(blockId); // Rollback any server writes already successfully made, for (int p = 0; p < successProcess.Count; p += 2) { IServiceAddress blocksAddr = (IServiceAddress) successProcess[p]; MessageStream toRollback = (MessageStream) successProcess[p + 1]; List<DataAddress> rollbackNodes = new List<DataAddress>(128); foreach (Message rm in toRollback) { DataAddress raddr = (DataAddress) rm.Arguments[0].Value; rollbackNodes.Add(raddr); } // Create the rollback message, MessageStream rollbackMsg = new MessageStream(); rollbackMsg.AddMessage(new Message("rollbackNodes", new object[] {rollbackNodes.ToArray()})); // Send it to the block server, IEnumerable<Message> responseStream = connector.Connect(blocksAddr, ServiceType.Block).Process(rollbackMsg); ++NetworkCommCount; foreach (Message rbm in responseStream) { // If rollback generated an error we throw the error now // because this likely is a serious network error. if (rbm.HasError) { throw new NetworkWriteException("Write failed (rollback failed): " + rbm.ErrorMessage); } } } // Retry, if (tryCount > 0) return InternalPersist(sequence, tryCount - 1); // Otherwise we fail the write throw new NetworkWriteException(m.ErrorMessage); } } // If we succeeded without an error, add to the log successProcess.Add(address); successProcess.Add(outputStream); } } // Return the references, return outNodeIds; }