/// <summary>
/// Writes all the branch and leaf nodes of a tree write sequence out to the
/// block servers and returns the node identifiers the nodes were stored under.
/// </summary>
/// <remarks>
/// The method asks the manager to allocate an address for every node,
/// serializes each node (rewriting in-memory child references of branches to
/// the newly allocated addresses), groups the writes by block and sends each
/// group to every server that holds the block. If a block server reports an
/// error, the failure is reported, the writes already completed on other
/// servers are rolled back and the whole operation is retried while
/// <paramref name="tryCount"/> is greater than zero.
/// </remarks>
/// <param name="sequence">The write sequence holding the branch and leaf
/// nodes to persist.</param>
/// <param name="tryCount">The number of retries still permitted after a
/// block server write failure.</param>
/// <returns>The identifiers assigned to the nodes, branches first and then
/// leafs, in the order they appear in <paramref name="sequence"/>.</returns>
/// <exception cref="ApplicationException">If the manager reports an error
/// while allocating, or the number of allocations returned does not match
/// the number of nodes.</exception>
/// <exception cref="NetworkWriteException">If a write fails and no retries
/// remain, or if a rollback request fails.</exception>
private NodeId[] InternalPersist(TreeWrite sequence, int tryCount) {
    // NOTE: nodes are written in order of branches and then leaf nodes. All
    //   branch nodes and leafs are grouped together.

    // The list of nodes to be allocated,
    IList<ITreeNode> allBranches = sequence.BranchNodes;
    IList<ITreeNode> allLeafs = sequence.LeafNodes;
    List<ITreeNode> nodes = new List<ITreeNode>(allBranches.Count + allLeafs.Count);
    nodes.AddRange(allBranches);
    nodes.AddRange(allLeafs);
    int sz = nodes.Count;

    // The list of allocated references for the nodes,
    DataAddress[] refs = new DataAddress[sz];
    NodeId[] outNodeIds = new NodeId[sz];

    MessageStream allocateMessageStream = new MessageStream();

    // Allocate the space first,
    for (int i = 0; i < sz; ++i) {
        // Branch nodes are 1K in size, anything else must be a leaf node
        // and leaf nodes are 4K in size,
        int allocSize = (nodes[i] is TreeBranch) ? 1024 : 4096;
        allocateMessageStream.AddMessage(new Message("allocateNode", allocSize));
    }

    // Process a command on the manager,
    IEnumerable<Message> resultStream = ProcessManager(allocateMessageStream);

    // The unique list of blocks,
    List<BlockId> uniqueBlocks = new List<BlockId>();

    // Parse the result stream one message at a time, the order will be the
    // order of the allocation messages,
    int n = 0;
    foreach (Message m in resultStream) {
        if (m.HasError) {
            throw new ApplicationException(m.ErrorMessage);
        }
        // Guard against a reply longer than the request; without this the
        // store below would fail with an index error.
        if (n >= sz) {
            throw new ApplicationException(
                "Manager returned more node allocations than the " + sz + " requested");
        }

        DataAddress addr = (DataAddress) m.Arguments[0].Value;
        refs[n] = addr;
        // Make a list of unique block identifiers,
        if (!uniqueBlocks.Contains(addr.BlockId)) {
            uniqueBlocks.Add(addr.BlockId);
        }
        ++n;
    }

    // A short reply would leave null addresses that are dereferenced below,
    if (n != sz) {
        throw new ApplicationException(
            "Manager returned " + n + " node allocations but " + sz + " were requested");
    }

    // Get the block to server map for each of the blocks,
    IDictionary<BlockId, IList<BlockServerElement>> blockToServerMap =
        GetServerListForBlocks(uniqueBlocks);

    // Make message streams for each unique block
    int ubidCount = uniqueBlocks.Count;
    MessageStream[] ubidStream = new MessageStream[ubidCount];
    for (int i = 0; i < ubidStream.Length; ++i) {
        ubidStream[i] = new MessageStream();
    }

    // Scan all the blocks and create the message streams,
    for (int i = 0; i < sz; ++i) {
        byte[] nodeBuf;

        ITreeNode node = nodes[i];
        // Is it a branch node?
        if (node is TreeBranch) {
            TreeBranch branch = (TreeBranch) node;

            // Make a copy of the branch (NOTE: the node data array is
            // cloned here so the original branch is not modified).
            long[] curNodeData = (long[]) branch.NodeData.Clone();
            int curNdsz = branch.NodeDataSize;
            branch = new TreeBranch(refs[i].Value, curNodeData, curNdsz);

            // The number of children
            int chsz = branch.ChildCount;
            // For each child, if it's a heap node, look up the child id and
            // reference map in the sequence and set the reference accordingly,
            for (int o = 0; o < chsz; ++o) {
                NodeId childId = branch.GetChild(o);
                if (childId.IsInMemory) {
                    // The ref is currently on the heap, so adjust accordingly
                    int refId = sequence.LookupRef(i, o);
                    branch.SetChildOverride(refs[refId].Value, o);
                }
            }

            // Turn the branch into a byte[] array for serialization. The
            // layout written is: node type, a reserved short, an int slot
            // for the checksum, the data size, then the data values.
            long[] nodeData = branch.NodeData;
            int ndsz = branch.NodeDataSize;
            using (MemoryStream bout = new MemoryStream(1024))
            using (BinaryWriter dout = new BinaryWriter(bout)) {
                dout.Write(StoreBranchType);
                dout.Write((short) 0); // Reserved for future
                dout.Write(0); // The crc32 checksum will be written here,
                dout.Write(ndsz);
                for (int o = 0; o < ndsz; ++o) {
                    dout.Write(nodeData[o]);
                }
                dout.Flush();

                // Turn it into a byte array,
                nodeBuf = bout.ToArray();
            }

            // Write the crc32 of the data (everything from offset 8 onward)
            // into the slot at offset 4,
            Crc32 checksum = new Crc32();
            checksum.ComputeHash(nodeBuf, 8, nodeBuf.Length - 8);
            ByteBuffer.WriteInt4((int) checksum.CrcValue, nodeBuf, 4);

            // Put this branch into the local cache,
            networkCache.SetNode(refs[i], branch);
        }
        // If it's a leaf node,
        else {
            TreeLeaf leaf = (TreeLeaf) node;
            int lfsz = leaf.Length;

            // 12 byte header followed by the leaf content,
            nodeBuf = new byte[lfsz + 12];

            // Format the data,
            ByteBuffer.WriteInt2(StoreLeafType, nodeBuf, 0);
            ByteBuffer.WriteInt2(0, nodeBuf, 2); // Reserved for future
            ByteBuffer.WriteInt4(lfsz, nodeBuf, 8);
            leaf.Read(0, nodeBuf, 12, lfsz);

            // Calculate and set the checksum,
            Crc32 checksum = new Crc32();
            checksum.ComputeHash(nodeBuf, 8, nodeBuf.Length - 8);
            ByteBuffer.WriteInt4((int) checksum.CrcValue, nodeBuf, 4);

            // Put this leaf into the local cache,
            leaf = new MemoryTreeLeaf(refs[i].Value, nodeBuf);
            networkCache.SetNode(refs[i], leaf);
        }

        // The DataAddress this node is being written to,
        DataAddress address = refs[i];
        // Get the block id,
        BlockId blockId = address.BlockId;
        int bid = uniqueBlocks.IndexOf(blockId);
        ubidStream[bid].AddMessage(
            new Message("writeToBlock", address, nodeBuf, 0, nodeBuf.Length));

        // Update the returned node id array,
        outNodeIds[i] = refs[i].Value;
    }

    // A log of successfully processed operations: the server written to and
    // the message stream that was sent to it,
    List<KeyValuePair<IServiceAddress, MessageStream>> successProcess =
        new List<KeyValuePair<IServiceAddress, MessageStream>>(32);

    // Now process the streams on the servers,
    for (int i = 0; i < ubidStream.Length; ++i) {
        // The output message,
        MessageStream outputStream = ubidStream[i];
        // Get the servers this message needs to be sent to,
        BlockId blockId = uniqueBlocks[i];
        IList<BlockServerElement> blockServers = blockToServerMap[blockId];

        // NOTE(review): if the server list for a block is empty nothing is
        //   written for that block and no error is raised - confirm whether
        //   the manager can ever return an empty list here.
        int bssz = blockServers.Count;

        // Send the stream to each of the block servers in turn,
        for (int o = 0; o < bssz; ++o) {
            IServiceAddress address = blockServers[o].Address;
            IMessageProcessor blockServerProc = connector.Connect(address, ServiceType.Block);
            IEnumerable<Message> inputStream = blockServerProc.Process(outputStream);
            ++NetworkCommCount;

            foreach (Message m in inputStream) {
                if (m.HasError) {
                    // If this is an error, we need to report the failure to the
                    // manager server,
                    ReportBlockServerFailure(address);
                    // Remove the block id from the server list cache,
                    networkCache.RemoveServersWithBlock(blockId);

                    // Rollback any server writes already successfully made,
                    foreach (KeyValuePair<IServiceAddress, MessageStream> written in successProcess) {
                        IServiceAddress blocksAddr = written.Key;
                        MessageStream toRollback = written.Value;

                        // The first argument of each write message is the
                        // address that was written,
                        List<DataAddress> rollbackNodes = new List<DataAddress>(128);
                        foreach (Message rm in toRollback) {
                            DataAddress raddr = (DataAddress) rm.Arguments[0].Value;
                            rollbackNodes.Add(raddr);
                        }

                        // Create the rollback message,
                        MessageStream rollbackMsg = new MessageStream();
                        rollbackMsg.AddMessage(
                            new Message("rollbackNodes", new object[] {rollbackNodes.ToArray()}));

                        // Send it to the block server,
                        IEnumerable<Message> responseStream =
                            connector.Connect(blocksAddr, ServiceType.Block).Process(rollbackMsg);
                        ++NetworkCommCount;
                        foreach (Message rbm in responseStream) {
                            // If rollback generated an error we throw the error now
                            // because this likely is a serious network error.
                            if (rbm.HasError) {
                                throw new NetworkWriteException(
                                    "Write failed (rollback failed): " + rbm.ErrorMessage);
                            }
                        }
                    }

                    // Retry,
                    if (tryCount > 0) {
                        return InternalPersist(sequence, tryCount - 1);
                    }

                    // Otherwise we fail the write
                    throw new NetworkWriteException(m.ErrorMessage);
                }
            }

            // If we succeeded without an error, add to the log
            successProcess.Add(
                new KeyValuePair<IServiceAddress, MessageStream>(address, outputStream));
        }
    }

    // Return the references,
    return outNodeIds;
}
/// <summary>
/// Resolves the tree nodes for the given node identifiers, using the local
/// network cache where possible and the block servers otherwise.
/// </summary>
/// <remarks>
/// Special node ids are resolved directly. Ids found in the local cache are
/// taken from it. The remaining ids are grouped by block and requested from
/// the servers holding each block, one server at a time, until a server
/// answers without error. Every node read is checked against its stored
/// CRC before being cached. Servers that fail are reported to the manager.
/// </remarks>
/// <param name="nids">The identifiers of the nodes to fetch.</param>
/// <returns>A list with one node for each entry of <paramref name="nids"/>,
/// in the same order.</returns>
/// <exception cref="ApplicationException">If a block could not be read from
/// any of its servers, or if the result could not be completely
/// populated.</exception>
public IList<ITreeNode> FetchNodes(NodeId[] nids) {
    // The number of nodes,
    int nodeCount = nids.Length;
    // The array of read nodes,
    ITreeNode[] resultNodes = new ITreeNode[nodeCount];

    // The unique blocks that must be queried, and for each one (at the same
    // index) the node ids to request from it,
    List<BlockId> uniqueBlocks = new List<BlockId>();
    List<List<NodeId>> uniqueBlockList = new List<List<NodeId>>();

    for (int i = 0; i < nodeCount; ++i) {
        NodeId nodeId = nids[i];

        // Special nodes are resolved without any lookup,
        if (nodeId.IsSpecial) {
            resultNodes[i] = nodeId.CreateSpecialTreeNode();
            continue;
        }

        // Check if the node is in the local cache,
        DataAddress address = new DataAddress(nodeId);
        ITreeNode cached = networkCache.GetNode(address);
        if (cached != null) {
            resultNodes[i] = cached;
            continue;
        }

        // Not in the local cache so we need to bundle this up in a node
        // request on the block servers, grouped by the block identifier,
        BlockId blockId = address.BlockId;
        int ind = uniqueBlocks.IndexOf(blockId);
        if (ind == -1) {
            ind = uniqueBlocks.Count;
            uniqueBlocks.Add(blockId);
            uniqueBlockList.Add(new List<NodeId>());
        }
        uniqueBlockList[ind].Add(nodeId);
    }

    // Exit early if no blocks,
    if (uniqueBlocks.Count == 0) {
        return resultNodes;
    }

    // Resolve server records for the given block identifiers,
    IDictionary<BlockId, IList<BlockServerElement>> serversMap =
        GetServerListForBlocks(uniqueBlocks);

    // The result nodes list,
    List<ITreeNode> nodes = new List<ITreeNode>();

    // Checksumming objects (created lazily and reused between nodes),
    byte[] checksumBuf = null;
    Crc32 crc32 = null;

    // For each unique block list,
    foreach (List<NodeId> blist in uniqueBlockList) {
        // Make a block server request for each node in the block,
        MessageStream blockServerMsg = new MessageStream();
        BlockId blockId = null;
        foreach (NodeId nodeId in blist) {
            DataAddress address = new DataAddress(nodeId);
            blockServerMsg.AddMessage(new Message("readFromBlock", address));
            blockId = address.BlockId;
        }

        if (blockId == null) {
            throw new ApplicationException("block_id == null");
        }

        // Get the list of servers the block is stored on,
        IList<BlockServerElement> servers = serversMap[blockId];

        // Go through the servers one at a time to fetch the block,
        bool success = false;
        for (int z = 0; z < servers.Count && !success; ++z) {
            BlockServerElement server = servers[z];

            // Skip servers that are not up,
            if (!server.IsStatusUp) {
                continue;
            }

            // Open a connection with the block server,
            IMessageProcessor blockServerProc =
                connector.Connect(server.Address, ServiceType.Block);
            IEnumerable<Message> messageIn = blockServerProc.Process(blockServerMsg);
            ++NetworkCommCount;
            ++NetworkFetchCommCount;

            bool isError = false;
            bool severeError = false;
            bool crcError = false;
            bool connectionError = false;

            // Turn each non-error message into a node
            foreach (Message m in messageIn) {
                if (m.HasError) {
                    // See if this error is a block read error. If it is, we don't
                    // tell the manager server to lock this server out completely.
                    // (Compared from the literal side so a null source is
                    // simply "not a block read error".)
                    bool isBlockReadError =
                        "Deveel.Data.Net.BlockReadException".Equals(m.Error.Source);

                    // If it's a connection fault,
                    if (IsConnectionFailMessage(m)) {
                        connectionError = true;
                    } else if (!isBlockReadError) {
                        // If it's something other than a block read error or
                        // connection failure, we set the severe flag,
                        severeError = true;
                    }
                    isError = true;
                } else if (!isError) {
                    // The reply contains the block of data read.
                    NodeSet nodeSet = (NodeSet) m.Arguments[0].Value;

                    DataAddress address = null;

                    // Catch any IOExceptions (corrupt data, etc)
                    try {
                        // Decode the node items into node objects,
                        foreach (Node nodeItem in nodeSet) {
                            NodeId nodeId = nodeItem.Id;
                            address = new DataAddress(nodeId);

                            // Wrap a reader around the node input for reading
                            // values from the store. The reader is not disposed
                            // because disposing a BinaryReader also closes the
                            // stream it wraps.
                            BinaryReader input = new BinaryReader(nodeItem.Input);
                            short nodeType = input.ReadInt16();

                            ITreeNode readNode = null;

                            if (crc32 == null) {
                                crc32 = new Crc32();
                            }
                            crc32.Initialize();

                            // Is the node type a leaf node?
                            if (nodeType == StoreLeafType) {
                                input.ReadInt16(); // For future use...
                                // Read the checksum,
                                int checksum = input.ReadInt32();
                                // Read the size
                                int leafSize = input.ReadInt32();
                                if (leafSize < 0) {
                                    // A corrupt header; handled like any other
                                    // IO error so the next server is tried.
                                    throw new IOException("Invalid leaf size: " + leafSize);
                                }

                                byte[] buf = StreamUtil.AsBuffer(nodeItem.Input);
                                if (buf == null) {
                                    buf = new byte[leafSize + 12];
                                    ByteBuffer.WriteInt4(leafSize, buf, 8);

                                    // BinaryReader.Read may return fewer bytes
                                    // than requested, so keep reading until the
                                    // leaf is complete or the stream ends. A
                                    // genuinely truncated leaf is then rejected
                                    // by the checksum test below.
                                    int offset = 12;
                                    int remaining = leafSize;
                                    while (remaining > 0) {
                                        int count = input.Read(buf, offset, remaining);
                                        if (count <= 0) {
                                            break;
                                        }
                                        offset += count;
                                        remaining -= count;
                                    }
                                }

                                // Check the checksum...
                                crc32.ComputeHash(buf, 8, leafSize + 4);
                                int calcChecksum = (int) crc32.CrcValue;
                                if (checksum != calcChecksum) {
                                    // If there's a CRC failure, we reject this node,
                                    log.Warning(String.Format("CRC failure on node {0} @ {1}", nodeId, server.Address));
                                    isError = true;
                                    crcError = true;
                                    // This causes the read to retry on a different server
                                    // with this block id
                                } else {
                                    // Create a leaf that's mapped to this data
                                    ITreeNode leaf = new MemoryTreeLeaf(nodeId, buf);
                                    readNode = leaf;
                                }
                            }
                            // Is the node type a branch node?
                            else if (nodeType == StoreBranchType) {
                                input.ReadInt16(); // For future use...
                                // Read the checksum,
                                int checksum = input.ReadInt32();

                                // Check the checksum objects,
                                if (checksumBuf == null) {
                                    checksumBuf = new byte[8];
                                }

                                // Note that the entire branch is loaded into memory,
                                int childDataSize = input.ReadInt32();
                                if (childDataSize < 0) {
                                    // A corrupt header; handled like any other
                                    // IO error so the next server is tried.
                                    throw new IOException("Invalid branch data size: " + childDataSize);
                                }

                                ByteBuffer.WriteInt4(childDataSize, checksumBuf, 0);
                                crc32.ComputeHash(checksumBuf, 0, 4);
                                long[] dataArr = new long[childDataSize];
                                for (int n = 0; n < childDataSize; ++n) {
                                    long item = input.ReadInt64();
                                    ByteBuffer.WriteInt8(item, checksumBuf, 0);
                                    crc32.ComputeHash(checksumBuf, 0, 8);
                                    dataArr[n] = item;
                                }

                                // The calculated checksum value,
                                int calcChecksum = (int) crc32.CrcValue;
                                if (checksum != calcChecksum) {
                                    // If there's a CRC failure, we reject this node,
                                    log.Warning(String.Format("CRC failure on node {0} @ {1}", nodeId, server.Address));
                                    isError = true;
                                    crcError = true;
                                    // This causes the read to retry on a different server
                                    // with this block id
                                } else {
                                    // Create the branch node,
                                    TreeBranch branch = new TreeBranch(nodeId, dataArr, childDataSize);
                                    readNode = branch;
                                }
                            } else {
                                log.Error(String.Format("Unknown node {0} type: {1}", address, nodeType));
                                isError = true;
                            }

                            // Is the node already in the list? If so we don't add it.
                            if (readNode != null && !IsInNodeList(nodeId, nodes)) {
                                // Put the read node in the cache and add it to the 'nodes'
                                // list.
                                networkCache.SetNode(address, readNode);
                                nodes.Add(readNode);
                            }
                        }
                    } catch (IOException e) {
                        // This catches compression errors, as well as any other misc
                        // IO errors.
                        if (address != null) {
                            log.Error(String.Format("IO Error reading node {0}", address));
                        }
                        log.Error(e.Message, e);
                        isError = true;
                    }
                }
            }

            // If there was no error while reading the result, we assume the node
            // requests were successfully read.
            if (!isError) {
                success = true;
            } else if (connectionError) {
                // If this is a connection failure, we report the block server
                // failure to the manager server,
                ReportBlockServerFailure(server.Address);
                // Remove the block id from the server list cache,
                networkCache.RemoveServersWithBlock(blockId);
            } else {
                String failType = "General";
                if (crcError) {
                    failType = "CRC Failure";
                } else if (severeError) {
                    failType = "Exception during process";
                }

                // Report to the first manager the block failure, so it may
                // investigate and hopefully correct.
                ReportBlockIdCorruption(server.Address, blockId, failType);

                // Otherwise, not a severe error (probably a corrupt block on a
                // server), so shuffle the server list for this block_id so next
                // time there's less chance of hitting this bad block.
                IEnumerable<BlockServerElement> srvs = networkCache.GetServersWithBlock(blockId);
                if (srvs != null) {
                    List<BlockServerElement> serverList = new List<BlockServerElement>();
                    serverList.AddRange(srvs);
                    CollectionsUtil.Shuffle(serverList);
                    networkCache.SetServersForBlock(blockId, serverList, 15*60*1000);
                }
            }

            // On error the loop goes on to retry the query on the next
            // block server,
        }

        // If the nodes were not successfully read, we generate an exception,
        if (!success) {
            // Remove from the cache,
            networkCache.RemoveServersWithBlock(blockId);
            throw new ApplicationException(
                "Unable to fetch node from a block server" +
                " (block = " + blockId + ")");
        }
    }

    int sz = nodes.Count;
    if (sz == 0) {
        throw new ApplicationException("Empty nodes list");
    }

    // Copy each fetched node into every result slot that asked for its id
    // (the same id may have been requested more than once),
    for (int i = 0; i < sz; ++i) {
        ITreeNode node = nodes[i];
        NodeId nodeId = node.Id;
        for (int n = 0; n < nids.Length; ++n) {
            if (nids[n].Equals(nodeId)) {
                resultNodes[n] = node;
            }
        }
    }

    // Check the result nodes list is completely populated,
    for (int n = 0; n < resultNodes.Length; ++n) {
        if (resultNodes[n] == null) {
            throw new ApplicationException("Assertion failed: result_nodes not completely populated.");
        }
    }

    return resultNodes;
}