/// <summary>
/// Writes every branch and leaf node of a tree write sequence to the block
/// servers and returns the address values the nodes were stored at.
/// </summary>
/// <remarks>
/// The manager server is asked to allocate one address per node, the write
/// requests are grouped by the block id of the allocated address, and each
/// group is sent to every server that <c>GetServersForBlock</c> lists for
/// that block. Every written node is also placed in <c>networkCache</c>.
/// When a block server answers with an error the failure is reported, the
/// writes that already succeeded are rolled back, and the whole operation is
/// retried while <paramref name="tryCount"/> is greater than zero.
/// </remarks>
/// <param name="sequence">The branch and leaf nodes to persist.</param>
/// <param name="tryCount">
/// The number of retries still permitted after a failed block server write.
/// </param>
/// <returns>
/// The address value of each node: all branch nodes first, then all leaf
/// nodes, each group in the order supplied by <paramref name="sequence"/>.
/// </returns>
/// <exception cref="NetworkException">
/// The manager returned a different number of allocations than requested, a
/// rollback failed, or a block server write failed with no retries left.
/// </exception>
private List<long> DoPersist(TreeWrite sequence, int tryCount) {
    // NOTE: nodes are written in order of branches and then leaf nodes. All
    //   branch nodes and leafs are grouped together.

    // The list of nodes to be allocated,
    IList<ITreeNode> allBranches = sequence.BranchNodes;
    IList<ITreeNode> allLeafs = sequence.LeafNodes;
    List<ITreeNode> nodes = new List<ITreeNode>(allBranches.Count + allLeafs.Count);
    nodes.AddRange(allBranches);
    nodes.AddRange(allLeafs);
    int sz = nodes.Count;

    // The list of allocated references for the nodes,
    DataAddress[] refs = new DataAddress[sz];
    long[] outRefs = new long[sz];

    MessageStream allocateMessage = new MessageStream(MessageType.Request);

    // Make a connection with the manager server,
    IMessageProcessor manager = connector.Connect(managerAddress, ServiceType.Manager);

    // Allocate the space first,
    for (int i = 0; i < sz; ++i) {
        ITreeNode node = nodes[i];
        RequestMessage request = new RequestMessage("allocateNode");
        // Is it a branch node?
        if (node is TreeBranch) {
            // Branch nodes are 1K in size,
            request.Arguments.Add(1024);
        } else {
            // Leaf nodes are 4k in size,
            request.Arguments.Add(4096);
        }
        allocateMessage.AddMessage(request);
    }

    // The result of the set of allocations,
    MessageStream resultStream = (MessageStream) manager.Process(allocateMessage);
    //DEBUG: ++network_comm_count;

    // The unique list of blocks, in first-seen order, plus an index from
    // block id to its position in that list so lookups are not linear scans.
    List<long> uniqueBlocks = new List<long>();
    Dictionary<long, int> blockIndex = new Dictionary<long, int>();

    // Parse the result stream one message at a time, the order will be the
    // order of the allocation messages,
    int n = 0;
    foreach (ResponseMessage m in resultStream) {
        if (m.HasError)
            throw m.Error.AsException();

        // More replies than requests would overrun 'refs'.
        if (n >= sz)
            throw new NetworkException("Manager returned more node allocations than were requested (" + sz + ")");

        DataAddress addr = (DataAddress) m.Arguments[0].Value;
        refs[n] = addr;
        // Make a list of unique block identifiers,
        long allocatedBlockId = addr.BlockId;
        if (!blockIndex.ContainsKey(allocatedBlockId)) {
            blockIndex.Add(allocatedBlockId, uniqueBlocks.Count);
            uniqueBlocks.Add(allocatedBlockId);
        }
        ++n;
    }

    // Fewer replies than requests would leave unset entries in 'refs' that
    // only fail later, far from the cause.
    if (n != sz)
        throw new NetworkException("Manager returned " + n + " node allocations but " + sz + " were requested");

    // Get the block to server map for each of the blocks,
    IDictionary<long, IList<BlockServerElement>> blockToServerMap = GetServersForBlock(uniqueBlocks);

    // Make message streams for each unique block
    int ubid_count = uniqueBlocks.Count;
    MessageStream[] ubidStream = new MessageStream[ubid_count];
    for (int i = 0; i < ubidStream.Length; ++i) {
        ubidStream[i] = new MessageStream(MessageType.Request);
    }

    // Scan all the blocks and create the message streams,
    for (int i = 0; i < sz; ++i) {
        byte[] nodeBuf;

        ITreeNode node = nodes[i];
        // Is it a branch node?
        if (node is TreeBranch) {
            TreeBranch branch = (TreeBranch) node;
            // Make a copy of the branch (NOTE; we Clone() the array here).
            long[] curNodeData = (long[]) branch.ChildPointers.Clone();
            int curNdsz = branch.DataSize;
            branch = new TreeBranch(refs[i].Value, curNodeData, curNdsz);

            // The number of children
            int chsz = branch.ChildCount;
            // For each child, if it's a heap node, look up the child id and
            // reference map in the sequence and set the reference accordingly,
            for (int o = 0; o < chsz; ++o) {
                long childRef = branch.GetChild(o);
                if (childRef < 0) {
                    // The ref is currently on the heap, so adjust accordingly
                    int ref_id = sequence.LookupRef(i, o);
                    branch.SetChildOverride(o, refs[ref_id].Value);
                }
            }

            // Turn the branch into a 'node_buf' byte[] array object for
            // serialization.
            long[] nodeData = branch.ChildPointers;
            int ndsz = branch.DataSize;
            using (MemoryStream bout = new MemoryStream(1024))
            using (BinaryWriter dout = new BinaryWriter(bout, Encoding.Unicode)) {
                dout.Write(BranchType);
                dout.Write(ndsz);
                for (int o = 0; o < ndsz; ++o) {
                    dout.Write(nodeData[o]);
                }
                dout.Flush();

                // Turn it into a byte array,
                nodeBuf = bout.ToArray();
            }

            // Put this branch into the local cache,
            networkCache.SetNode(refs[i], branch);
        } else {
            // If it's a leaf node,
            TreeLeaf leaf = (TreeLeaf) node;
            int lfsz = leaf.Length;

            // 6 byte header (2 byte type + 4 byte length) followed by the data.
            nodeBuf = new byte[lfsz + 6];
            // Technically, we could comment these next two lines out.
            ByteBuffer.WriteInt2(LeafType, nodeBuf, 0);
            ByteBuffer.WriteInt4(lfsz, nodeBuf, 2);
            leaf.Read(0, nodeBuf, 6, lfsz);

            // Put this leaf into the local cache,
            leaf = new ByteArrayTreeLeaf(refs[i].Value, nodeBuf);
            networkCache.SetNode(refs[i], leaf);
        }

        // The DataAddress this node is being written to,
        DataAddress address = refs[i];
        // Get the block id,
        long blockId = address.BlockId;
        // Every allocated block id was indexed while parsing the allocations.
        int bid = blockIndex[blockId];
        RequestMessage request = new RequestMessage("writeToBlock");
        request.Arguments.Add(address);
        request.Arguments.Add(nodeBuf);
        request.Arguments.Add(0);
        request.Arguments.Add(nodeBuf.Length);
        ubidStream[bid].AddMessage(request);

        // Update 'outRefs' array,
        outRefs[i] = refs[i].Value;
    }

    // A log of successfully processed operations, stored as consecutive
    // (server address, request stream) pairs.
    List<object> successProcess = new List<object>(64);

    // Now process the streams on the servers,
    for (int i = 0; i < ubidStream.Length; ++i) {
        // The output message,
        MessageStream requestMessageStream = ubidStream[i];

        // Get the servers this message needs to be sent to,
        long block_id = uniqueBlocks[i];
        IList<BlockServerElement> blockServers = blockToServerMap[block_id];

        // Format a message for writing this node out,
        int bssz = blockServers.Count;
        IMessageProcessor[] blockServerProcs = new IMessageProcessor[bssz];

        // Make the block server connections,
        for (int o = 0; o < bssz; ++o) {
            IServiceAddress address = blockServers[o].Address;
            blockServerProcs[o] = connector.Connect(address, ServiceType.Block);
            MessageStream responseMessageStream = (MessageStream) blockServerProcs[o].Process(requestMessageStream);
            //DEBUG: ++network_comm_count;

            if (responseMessageStream.HasError) {
                // If this is an error, we need to report the failure to the
                // manager server,
                ReportBlockServerFailure(address);
                // Remove the block id from the server list cache,
                networkCache.RemoveServers(block_id);

                // Rollback any server writes already successfully made,
                for (int p = 0; p < successProcess.Count; p += 2) {
                    IServiceAddress blockAddress = (IServiceAddress) successProcess[p];
                    MessageStream toRollback = (MessageStream) successProcess[p + 1];

                    // The first argument of every write request is the node
                    // address that has to be rolled back.
                    List<DataAddress> rollbackNodes = new List<DataAddress>(128);
                    foreach (Message rm in toRollback) {
                        DataAddress raddr = (DataAddress) rm.Arguments[0].Value;
                        rollbackNodes.Add(raddr);
                    }

                    // Create the rollback message,
                    RequestMessage rollbackRequest = new RequestMessage("rollbackNodes");
                    rollbackRequest.Arguments.Add(rollbackNodes.ToArray());

                    // Send it to the block server,
                    Message responseMessage = connector.Connect(blockAddress, ServiceType.Block).Process(rollbackRequest);
                    //DEBUG: ++network_comm_count;

                    // If rollback generated an error we throw the error now
                    // because this likely is a serious network error.
                    if (responseMessage.HasError)
                        throw new NetworkException("Rollback wrote failed: " + responseMessage.ErrorMessage);
                }

                // Retry,
                // NOTE(review): nodes placed in networkCache by this attempt are
                // not evicted before retrying - confirm stale entries for the
                // rolled back addresses are harmless.
                if (tryCount > 0)
                    return DoPersist(sequence, tryCount - 1);

                // Otherwise we fail the write
                throw new NetworkException(responseMessageStream.ErrorMessage);
            }

            // If we succeeded without an error, add to the log
            successProcess.Add(address);
            successProcess.Add(requestMessageStream);
        }
    }

    // Return the references,
    return new List<long>(outRefs);
}
/// <summary>
/// Fetches the tree nodes with the given ids, using the local node cache
/// where possible and reading the remainder from the block servers.
/// </summary>
/// <remarks>
/// Ids with the special-node bit set are materialised locally through
/// <c>SparseLeafNode.Create</c>. Ids found in <c>networkCache</c> are served
/// from it. The remaining ids are grouped by block id and requested from the
/// servers holding each block, one server at a time, until one answers
/// without error. Every node read from a server is added to the cache.
/// </remarks>
/// <param name="nids">The ids of the nodes to fetch.</param>
/// <returns>
/// The fetched nodes; the node at index <c>i</c> corresponds to
/// <c>nids[i]</c>.
/// </returns>
/// <exception cref="ApplicationException">
/// No server could supply a block, nothing was read, or an id remained
/// unresolved.
/// </exception>
/// <exception cref="EndOfStreamException">
/// A leaf's data ended before the length declared in its header.
/// </exception>
public IList<ITreeNode> FetchNodes(long[] nids) {
    // The number of nodes,
    int node_count = nids.Length;
    // The array of read nodes,
    ITreeNode[] result_nodes = new ITreeNode[node_count];

    // Resolve special nodes first,
    for (int i = 0; i < node_count; ++i) {
        long nodeId = nids[i];
        if ((nodeId & 0x01000000000000000L) != 0)
            result_nodes[i] = SparseLeafNode.Create(nodeId);
    }

    // Group all the nodes to the same block. 'uniqueBlocks' keeps first-seen
    // order, 'uniqueBlockList' holds the node ids per block at the same
    // position, and 'block_index' maps a block id to that position.
    List<long> uniqueBlocks = new List<long>();
    List<List<long>> uniqueBlockList = new List<List<long>>();
    Dictionary<long, int> block_index = new Dictionary<long, int>();
    for (int i = 0; i < node_count; ++i) {
        long node_ref = nids[i];
        // Special nodes were already resolved above,
        if ((node_ref & 0x01000000000000000L) != 0)
            continue;

        // Get the block id and add it to the list of unique blocks,
        DataAddress address = new DataAddress(node_ref);
        // Check if the node is in the local cache,
        ITreeNode node = networkCache.GetNode(address);
        if (node != null) {
            result_nodes[i] = node;
            continue;
        }

        // Not in the local cache so we need to bundle this up in a node
        // request on the block servers,
        // Group this node request by the block identifier
        long blockId = address.BlockId;
        int ind;
        if (!block_index.TryGetValue(blockId, out ind)) {
            ind = uniqueBlocks.Count;
            block_index.Add(blockId, ind);
            uniqueBlocks.Add(blockId);
            uniqueBlockList.Add(new List<long>());
        }
        List<long> blist = uniqueBlockList[ind];
        blist.Add(node_ref);
    }

    // Exit early if no blocks,
    if (uniqueBlocks.Count == 0)
        return result_nodes;

    // Resolve server records for the given block identifiers,
    IDictionary<long, IList<BlockServerElement>> servers_map = GetServersForBlock(uniqueBlocks);

    // The result nodes list,
    List<ITreeNode> nodes = new List<ITreeNode>();

    // For each unique block list,
    foreach (List<long> blist in uniqueBlockList) {
        // Make a block server request for each node in the block,
        MessageStream block_server_msg = new MessageStream(MessageType.Request);
        long block_id = -1;
        foreach (long node_ref in blist) {
            DataAddress address = new DataAddress(node_ref);
            RequestMessage request = new RequestMessage("readFromBlock");
            request.Arguments.Add(address);
            block_server_msg.AddMessage(request);
            block_id = address.BlockId;
        }

        if (block_id == -1)
            throw new ApplicationException("block_id == -1");

        // Get the shuffled list of servers the block is stored on,
        IList<BlockServerElement> servers = servers_map[block_id];

        // Go through the servers one at a time to fetch the block,
        bool success = false;
        for (int z = 0; z < servers.Count && !success; ++z) {
            BlockServerElement server = servers[z];
            // Only servers that are up are tried,
            if (!server.IsStatusUp)
                continue;

            // Open a connection with the block server,
            IMessageProcessor block_server_proc = connector.Connect(server.Address, ServiceType.Block);
            MessageStream message_in = (MessageStream) block_server_proc.Process(block_server_msg);
            // DEBUG: ++networkCommCount;
            // DEBUG: ++networkFetchCommCount;

            bool is_error = false;
            bool severe_error = false;
            // Turn each none-error message into a node
            foreach (ResponseMessage m in message_in) {
                if (m.HasError) {
                    // See if this error is a block read error. If it is, we don't
                    // tell the manager server to lock this server out completely.
                    // Constant-first comparison so a missing error source is
                    // classified as "not a block read error" instead of throwing.
                    bool is_block_read_error = "Deveel.Data.Net.BlockReadException".Equals(m.Error.Source);
                    if (!is_block_read_error) {
                        // If it's something other than a block read error, we mark
                        // this error as severe,
                        severe_error = true;
                    }
                    is_error = true;
                } else if (!is_error) {
                    // The reply contains the block of data read.
                    NodeSet node_set = (NodeSet) m.Arguments[0].Value;

                    // Decode the node items into node objects,
                    IEnumerator<Node> item_iterator = node_set.GetEnumerator();

                    while (item_iterator.MoveNext()) {
                        // Get the node item,
                        Node node_item = item_iterator.Current;

                        long node_ref = node_item.Id;
                        DataAddress address = new DataAddress(node_ref);

                        // Wrap the node's input for reading values from the store.
                        BinaryReader input = new BinaryReader(node_item.Input, Encoding.Unicode);
                        short node_type = input.ReadInt16();

                        ITreeNode read_node;

                        // Is the node type a leaf node?
                        if (node_type == LeafType) {
                            // Read the size of the leaf data
                            int leaf_size = input.ReadInt32();

                            byte[] buf = ReadNodeAsBuffer(node_item);
                            if (buf == null) {
                                // 6 byte header (type + size) followed by the data.
                                buf = new byte[leaf_size + 6];

                                // A single Read call may return fewer bytes than
                                // asked for, so keep reading until the leaf is
                                // complete rather than caching a partial leaf.
                                int read_total = 0;
                                while (read_total < leaf_size) {
                                    int read_count = input.Read(buf, 6 + read_total, leaf_size - read_total);
                                    if (read_count <= 0)
                                        throw new EndOfStreamException("Leaf node data ended after " + read_total + " of " + leaf_size + " bytes");
                                    read_total += read_count;
                                }

                                // Technically, we could comment these next two lines out.
                                ByteBuffer.WriteInt2(node_type, buf, 0);
                                ByteBuffer.WriteInt4(leaf_size, buf, 2);
                            }

                            // Create a leaf that's mapped to this data
                            read_node = new ByteArrayTreeLeaf(node_ref, buf);
                        }
                        // Is the node type a branch node?
                        else if (node_type == BranchType) {
                            // Note that the entire branch is loaded into memory,
                            int child_data_size = input.ReadInt32();
                            long[] data_arr = new long[child_data_size];
                            for (int n = 0; n < child_data_size; ++n) {
                                data_arr[n] = input.ReadInt64();
                            }
                            // Create the branch node,
                            read_node = new TreeBranch(node_ref, data_arr, child_data_size);
                        } else {
                            throw new InvalidDataState("Unknown node type: " + node_type, address);
                        }

                        // Is the node already in the list? If so we don't add it.
                        if (!IsInNodeList(node_ref, nodes)) {
                            // Put the read node in the cache and add it to the 'nodes'
                            // list.
                            networkCache.SetNode(address, read_node);
                            nodes.Add(read_node);
                        }
                    }
                }
            }

            // If there was no error while reading the result, we assume the node
            // requests were successfully read.
            if (!is_error) {
                success = true;
            } else if (severe_error) {
                // If this is an error, we need to report the failure to the
                // manager server,
                ReportBlockServerFailure(server.Address);
                // Remove the block id from the server list cache,
                networkCache.RemoveServers(block_id);
            } else {
                // Otherwise, not a severe error (probably a corrupt block on a
                // server), so shuffle the server list for this block_id so next
                // time there's less chance of hitting this bad block.
                IList<BlockServerElement> srvs = networkCache.GetServers(block_id);
                // NOTE(review): presumably GetServers returns null when nothing is
                // cached for the block (e.g. after the RemoveServers call above ran
                // for an earlier server) - confirm. Skip the re-shuffle in that case.
                if (srvs != null) {
                    List<BlockServerElement> server_list = new List<BlockServerElement>();
                    server_list.AddRange(srvs);
                    CollectionsUtil.Shuffle(server_list);
                    networkCache.SetServers(block_id, server_list, 15 * 60 * 1000);
                }
            }
        }

        // If the nodes were not successfully read, we generate an exception,
        if (!success) {
            // Remove from the cache,
            networkCache.RemoveServers(block_id);
            throw new ApplicationException("Unable to fetch node from block server");
        }
    }

    int sz = nodes.Count;
    if (sz == 0)
        throw new ApplicationException("Empty nodes list");

    // Index the fetched nodes by id so each requested id is resolved with one
    // lookup; an id that occurs several times in 'nids' fills every position.
    Dictionary<long, ITreeNode> fetched_by_id = new Dictionary<long, ITreeNode>(sz);
    for (int i = 0; i < sz; ++i) {
        ITreeNode node = nodes[i];
        fetched_by_id[node.Id] = node;
    }
    for (int n = 0; n < nids.Length; ++n) {
        ITreeNode fetched;
        if (fetched_by_id.TryGetValue(nids[n], out fetched))
            result_nodes[n] = fetched;
    }

    // Check the result_nodes list is completely populated,
    for (int n = 0; n < result_nodes.Length; ++n) {
        if (result_nodes[n] == null)
            throw new ApplicationException("Assertion failed: result_nodes not completely populated.");
    }

    return result_nodes;
}