예제 #1
0
            /// <summary>
            /// Calls <c>AbortESRBlocksFromFailedHosts</c> to collect the workload that has to be
            /// redone and, when that workload is non-empty, creates a child <c>FailoverInfo</c>,
            /// registers it under the current <c>cID</c>, and runs it on a background thread.
            /// </summary>
            /// <exception cref="Exception">
            /// Thrown when <c>badHosts.Count</c> is greater than or equal to
            /// <c>failoverShared.dc.Replication</c>.
            /// </exception>
            internal void RehashESRBlocksFromFailedHosts()
            {
                List<string> pendingWorkload = new List<string>();
                int startingBlockCount;
                AbortESRBlocksFromFailedHosts(pendingWorkload, out startingBlockCount);

                if (badHosts.Count >= failoverShared.dc.Replication)
                {
                    throw new Exception("The number of machines removed >= Replication factor");
                }

                // Nothing was handed back for rehashing, so there is no child to start.
                if (pendingWorkload.Count == 0)
                {
                    return;
                }

                // Register the child's status slot (0) before the child can report back.
                lock (blockStatus)
                {
                    blockStatus.Add(cID, 0);
                }

                FailoverInfo child = new FailoverInfo(failoverShared.dc);
                childFailovers.Add(cID, child);

                // Step the block count down through AELight.NearestPrimeLE until it differs
                // from the configured zblocks count (presumably "nearest prime <= n" - confirm).
                int rehashBlockCount = startingBlockCount;
                do
                {
                    rehashBlockCount = AELight.NearestPrimeLE(rehashBlockCount - 1);
                }
                while (rehashBlockCount == failoverShared.dc.slave.zblocks.count);

#if FAILOVER_DEBUG
                Log("rehashing...newblockcount:" + rehashBlockCount.ToString() + ";cid:" + cID.ToString());
#endif
                string[] healthyHosts = new List<string>(goodHosts.Keys).ToArray();
                string[] failedHosts = new List<string>(badHosts.Keys).ToArray();

                // The final argument (true) selects rehash mode.
                child.CreateBlocks(rehashBlockCount, pendingWorkload, null, null, null, failoverShared, healthyHosts, failedHosts, true);

                // Copy cID into a local so the thread sees this value even after cID is incremented below.
                int childCid = cID;
                System.Threading.ThreadStart runChild = delegate()
                {
                    child.ExecOneMapReduceFailover(this, childCid);
                };
                System.Threading.Thread worker = new System.Threading.Thread(runChild);
                worker.IsBackground = true;
                worker.Start();

                cID++;
            }
예제 #2
0
            /// <summary>
            /// Runs one map/reduce pass with failover handling. It starts a map thread for
            /// every block in <c>allBlocks</c> and polls until <c>AllBlocksCompleted(1)</c>,
            /// periodically calling <c>CheckHealthMap</c>; swaps failed working blocks for a
            /// healthy replica; then starts an exchange thread for every working block and
            /// polls again, periodically calling <c>CheckHealthESR</c> and rehashing through
            /// <c>RehashESRBlocksFromFailedHosts</c> when that reports a problem.
            /// </summary>
            /// <param name="parent">
            /// The failover that started this one, or null. When non-null it receives this
            /// instance's <c>hostToESRBlocks</c> on success, and it is always notified through
            /// <c>UpdateBlockStatus(myCID, 1)</c>, whether or not an exception occurred.
            /// </param>
            /// <param name="myCID">The id passed to <c>parent.UpdateBlockStatus</c> on exit.</param>
            /// <remarks>
            /// Exceptions raised inside the body are not rethrown: they are stored in
            /// <c>LastException</c> and written with <c>LogOutput</c>.
            /// </remarks>
            internal void ExecOneMapReduceFailover(FailoverInfo parent, int myCID)
            {
#if FAILOVER_DEBUG
                {
                    Log("Begin ExecOneMapReduceFailover: parent=" + (parent == null ? "null" : "has parent") + "; myCID=" + myCID.ToString());                               
                }
#endif                

                try
                {
                    // allBlocks must already be populated (presumably by CreateBlocks - confirm).
                    if (allBlocks == null)
                    {
                        throw new Exception("FailoverInfo allBlocks is null.");
                    }

                    // ---- Map phase: start the firstthreadproc thread of every block ----
                    // sleepCnt counts polling iterations since the last health check.
                    uint sleepCnt = 0;
                    {

#if TESTFAULTTOLERANT
                        {
                            // Test hook: stall here for as long as the marker file exists.
                            Console.WriteLine(@"FAILOVER_TEST: before map threads start");                            
                            while (System.IO.File.Exists(@"c:\temp\failovertest1.txt"))
                            {
                                Console.Write("z");
                                System.Threading.Thread.Sleep(10000);
                            }
                        }
#endif

                        // One background thread per block; each block also gets a reference
                        // to the shared workingBlocks list.
                        foreach (MapReduceBlockInfo bl in allBlocks)
                        {
                            bl.all = workingBlocks;
                            bl.thread = new System.Threading.Thread(new System.Threading.ThreadStart(bl.firstthreadproc));
                            bl.thread.Name = "MapReduceJobBlock" + bl.BlockID + "_map";
                            bl.thread.IsBackground = true;
                            AELight_StartTraceThread(bl.thread); // presumably starts the thread - confirm
                        }

                        awakeCnt = 0;

                        // Poll until every map block reports completion.
                        for (; ; )
                        {
#if FAILOVER_DEBUG
                            {
                                Log("Loop at map.  Sleepcnt=" + sleepCnt.ToString());
                                string debugtxt = "==========Blockstatus at map==========" + Environment.NewLine;
                                lock (blockStatus)
                                {
                                    foreach (KeyValuePair<int, int> pair in blockStatus)
                                    {
                                        debugtxt += "****blockcid=" + pair.Key.ToString() + ";status=" + pair.Value.ToString() + Environment.NewLine;
                                    }
                                }
                                Log(debugtxt);
                            }
#endif

                            if (AllBlocksCompleted(1))
                            {
#if FAILOVER_DEBUG
                                Log("All map blocks completed.  Breaking out of map loop...");
#endif
                                break;
                            }

                            System.Threading.Thread.Sleep(failoverShared.dc.FailoverTimeout);

                            // Post-increment: the comparison uses the value before the increment,
                            // so the health check runs only once sleepCnt has exceeded FailoverDoCheck.
                            if (sleepCnt++ > failoverShared.dc.FailoverDoCheck)
                            {
                                sleepCnt = 0;  // restart the count towards the next health check

#if FAILOVER_DEBUG
                                Log("Health check at map loop;awakeCnt=" + awakeCnt.ToString());
#endif

                                // A true result is treated as "new bad hosts were found".
                                if (CheckHealthMap(false))
                                {
#if FAILOVER_DEBUG                                    
                                    {
                                        Log("Disk failure detected at map loop...");
                                        string debugtxt = "======Bad hosts found=======" + Environment.NewLine +
                                            string.Join(";", new List<string>(newBadHostToReason.Keys).ToArray());
                                        Log(debugtxt);
                                    }
#endif                               
                                    DisplayNewBadHosts();    

                                    // Abort the blocks of every newly reported bad host.
                                    foreach(string bh in newBadHostToReason.Keys)
                                    {
                                        AbortBlocksFromFailedHost(bh);
                                    }
                                    

#if FAILOVER_DEBUG
                                    {
                                        Log("Done removing all bad blocks at map loop");
                                        string debugtxt = "";
                                        Log("========failover.allblocks========");
                                        foreach (MapReduceBlockInfo bl in allBlocks)
                                        {    
                                            debugtxt += Environment.NewLine +
                                            "****blockid=" + bl.BlockID.ToString() + ";blockcid=" + bl.BlockCID.ToString()
                                            + ";dfdetected=" + bl.diskfailuredetected.ToString()
                                            + ";host=" + bl.SlaveHost + Environment.NewLine + Environment.NewLine;
                                        }
                                        Log(debugtxt);
                                    }
                                    {
                                        string debugtxt = "==========Blockstatus at map==========" + Environment.NewLine;
                                        lock (blockStatus)
                                        {
                                            foreach (KeyValuePair<int, int> pair in blockStatus)
                                            {
                                                debugtxt += "****blockcid=" + pair.Key.ToString() + ";status=" + pair.Value.ToString() + Environment.NewLine;
                                            }
                                        }
                                        Log(debugtxt);
                                        Log("hostToBlocksCount=" + hostToBlocks.Count.ToString());
                                    }
#endif
                                }
                                awakeCnt++;
                            }
                        }

#if TESTFAULTTOLERANT
                        {
                            // Test hook: stall here for as long as the marker file exists.
                            Console.WriteLine(@"FAILOVER_TEST: after map threads joined");
                            while (System.IO.File.Exists(@"c:\temp\failovertest2.txt"))
                            {
                                Console.Write("z");
                                System.Threading.Thread.Sleep(10000);
                            }
                        }
#endif

                        // One more health check after the loop, this time with argument true
                        // (the meaning of the flag is defined by CheckHealthMap - not visible here).
                        if(CheckHealthMap(true))
                        {
                            DisplayNewBadHosts(); 
                            foreach(string bh in newBadHostToReason.Keys)
                            {
                                AbortBlocksFromFailedHost(bh);
                            }
                        }                        

                        // Join only the threads of blocks not flagged with a disk failure;
                        // flagged blocks are deliberately not joined.
                        for (int bi = 0; bi < allBlocks.Length; bi++)
                        {
                            MapReduceBlockInfo bl = allBlocks[bi];
                            if (!bl.diskfailuredetected)
                            {
                                AELight_JoinTraceThread(bl.thread);
                            }
                        }
#if FAILOVER_DEBUG
                        Log("All map blocks joined.");
#endif
                        // Give up when as many hosts have been removed as there are replicas.
                        if (badHosts.Count >= failoverShared.dc.Replication)
                        {
                            throw new Exception("Error: Cannot continue to exchange/sort/reduce phase.  The number of machines removed (" + (badHosts.Count).ToString() + ") is greater than or equal to replication factor (" + failoverShared.dc.Replication.ToString() + ").");
                        }

                        // Replace each failed working block with the first healthy replica.
                        // NOTE(review): the index arithmetic assumes allBlocks is laid out as
                        // consecutive groups of workingBlocks.Count blocks, one group per
                        // replica, indexed by BlockCID within a group - confirm against CreateBlocks.
                        for (int bi = 0; bi < workingBlocks.Count; bi++)
                        {
                            MapReduceBlockInfo bl = workingBlocks[bi];
                            if (bl.diskfailuredetected || bl.blockfail)
                            {
                                bool foundgoodblock = false;
                                for (int ri = 0; ri < failoverShared.dc.Replication - 1; ri++)
                                {
                                    int nextrepblockcid = (ri + 1) * workingBlocks.Count + bl.BlockCID;
                                    MapReduceBlockInfo nextrepblock = allBlocks[nextrepblockcid];
                                    if (!nextrepblock.diskfailuredetected && !nextrepblock.blockfail)
                                    {
                                        foundgoodblock = true;
                                        workingBlocks[bi] = nextrepblock;
                                        break;
                                    }
                                }
                                if (!foundgoodblock)
                                {
                                    throw new Exception("Error: Cannot find a good replicated map block to replace the failed block.  Block index = " + bi.ToString());
                                }
                            }
                        }
#if FAILOVER_DEBUG
                        {
                            Log("=======Blocks going forward to exchange=========");
                            string debugtxt = "";
                            foreach (MapReduceBlockInfo bl in workingBlocks)
                            {
                                debugtxt += Environment.NewLine +
                                    "****blockid=" + bl.BlockID.ToString() + ";blockcid=" + bl.BlockCID.ToString() + ";host=" + bl.SlaveHost +
                                    ";dfdetected=" + bl.diskfailuredetected.ToString() + Environment.NewLine;
                            }
                            Log(debugtxt);
                        }
#endif
                    }

                    if (failoverShared.verbose)
                    {
                        // NOTE(review): the format string only contains {0}, so the second
                        // argument (Millisecond) is never printed.
                        Console.WriteLine((failoverShared.extraverbose ? "\r\n" : "") + "    [{0}]        Map done; starting map exchange", System.DateTime.Now.ToString(), System.DateTime.Now.Millisecond);
                        ConsoleFlush();
                    }

                    // All map threads are joined.
                    // Build, per BlockID, a '*'-separated list of the zmapblock paths of every
                    // healthy block (original and replicas) that carries that BlockID.
                    string[] zmapblocks = new string[blockCount];
                    {
                        foreach (MapReduceBlockInfo bl in allBlocks)
                        {
                            if (!bl.diskfailuredetected && !bl.blockfail)
                            {
                                string zm = zmapblocks[bl.BlockID];
                                if (zm != null)
                                {
                                    zm += "*";
                                }
                                else
                                {
                                    zm = "";
                                }
                                zm += Surrogate.NetworkPathForHost(bl.SlaveHost) + @"\" + bl.acl.GetZMapBlockBaseName();
                                zmapblocks[bl.BlockID] = zm;
                            }
                        }
                    }
#if FAILOVER_DEBUG
                    {
                        /*string debugtxt = "zmapblocks: len=" + zmapblocks.Length.ToString() + Environment.NewLine;
                        for (int zi = 0; zi < zmapblocks.Length; zi++)
                        {
                            debugtxt += zi.ToString() + ":" + zmapblocks[zi] + Environment.NewLine;
                        }
                        Log(debugtxt);*/
                    }
#endif

                    // Give every working block its own entry (ownedzmapblocks) plus a
                    // ';'-separated list of all other entries (remotezmapblocks).
                    // NOTE(review): an entry with no healthy block stays null and is
                    // appended as an empty string here - confirm that is acceptable downstream.
                    for (int wi = 0; wi < workingBlocks.Count; wi++)
                    {
                        MapReduceBlockInfo wb = workingBlocks[wi];
                        wb.ownedzmapblocks = zmapblocks[wb.BlockID];

                        string remotezms = "";
                        for (int zi = 0; zi < zmapblocks.Length; zi++)
                        {
                            if (zi != wb.BlockID)
                            {
                                if (remotezms.Length > 0)
                                {
                                    remotezms += ";";
                                }
                                remotezms += zmapblocks[zi];
                            }
                        }
                        wb.remotezmapblocks = remotezms;
                    }

#if FAILOVER_DEBUG
                    {
                        /*
                        string debugtxt = "Done assign zmapblocks workload for each working thread:" + Environment.NewLine;
                        for (int wi = 0; wi < workingBlocks.Count; wi++)
                        {
                            MapReduceBlockInfo wb = workingBlocks[wi];
                            debugtxt += "blockid=" + wb.BlockID.ToString() + ";blockcid=" + wb.BlockCID.ToString() + Environment.NewLine +
                                "owned=" + wb.ownedzmapblocks + Environment.NewLine
                                + "remote=" + wb.remotezmapblocks.Split(';').Length.ToString() + Environment.NewLine;
                        }
                        Log(debugtxt);*/
                    }
#endif


                    // ---- Exchange/sort/reduce (ESR) phase ----
                    // Reset the status table for the ESR phase.
                    // NOTE(review): blockStatus is modified here and in the loop below without
                    // lock (blockStatus), unlike the other accesses in this method - confirm
                    // that no other thread can touch it at this point.
                    blockStatus.Clear();

                    // Register every working block with status 0 and group the blocks by
                    // lower-cased host name in hostToESRBlocks.
                    for (int bi = 0; bi < workingBlocks.Count; bi++)
                    {
                        MapReduceBlockInfo bl = workingBlocks[bi];
                        blockStatus.Add(bl.BlockCID, 0);
                        string host = bl.SlaveHost.ToLower();
                        if (!hostToESRBlocks.ContainsKey(host))
                        {
                            hostToESRBlocks.Add(host, new List<MapReduceBlockInfo>());
                        }
                        hostToESRBlocks[host].Add(bl);
                    }

#if TESTFAULTTOLERANT
                    {
                        // Test hook: stall here for as long as the marker file exists.
                        Console.WriteLine(@"FAILOVER_TEST: before esr threads start");
                        while (System.IO.File.Exists(@"c:\temp\failovertest3.txt"))
                        {
                            Console.Write("z");
                            System.Threading.Thread.Sleep(10000);
                        }
                    }
#endif

                    // Start one exchangethreadproc thread per working block.
                    for (int bi = 0; bi < workingBlocks.Count; bi++)
                    {
                        MapReduceBlockInfo bl = workingBlocks[bi];
                        bl.thread = new System.Threading.Thread(new System.Threading.ThreadStart(bl.exchangethreadproc));
                        bl.thread.Name = "MapReduceJobBlock" + bl.BlockID + "_aftermap";
                        bl.thread.IsBackground = true;
                        AELight_StartTraceThread(bl.thread);
                    }

                    // Both counters restart for the ESR polling loop.
                    sleepCnt = 0; //!
                    awakeCnt = 0; //!
                    for (; ; )
                    {

#if FAILOVER_DEBUG
                        {
                            Log("Loop at esr.  SleepCnt=" + sleepCnt.ToString());
                            string debugtxt = "==========Blockstatus at esr==========" + Environment.NewLine;
                            lock (blockStatus)
                            {
                                foreach (KeyValuePair<int, int> pair in blockStatus)
                                {
                                    debugtxt += "****blockcid=" + pair.Key.ToString() + ";status=" + pair.Value.ToString() + Environment.NewLine;
                                }
                            }
                            Log(debugtxt);
                        }
#endif

                        if (AllBlocksCompleted(1))
                        {
#if FAILOVER_DEBUG
                            Log("All esr blocks completed.");
#endif

#if TESTFAULTTOLERANT
                            {
                                // Test hook: stall here for as long as the marker file exists.
                                Console.WriteLine(@"FAILOVER_TEST: all esr blocks completed, before breaking out of loop");                               
                                while (System.IO.File.Exists(@"c:\temp\failovertest4.txt"))
                                {
                                    Console.Write("z");
                                    System.Threading.Thread.Sleep(10000);
                                }
                            }
#endif

                            // Do one more health check before leaving the loop. If it reports
                            // a failure, rehash and keep polling instead of breaking out.
                            if(CheckHealthESR(true))
                            {

#if FAILOVER_DEBUG
                                Log("df detected before breaking out of esr loop");
#endif

                                sleepCnt = 0;
                                DisplayNewBadHosts();                                
                                RehashESRBlocksFromFailedHosts();
                            }
                            else
                            {
                                break;
                            }
                        }

                        System.Threading.Thread.Sleep(failoverShared.dc.FailoverTimeout);

                        // Same post-increment cadence as the map loop.
                        if (sleepCnt++ > failoverShared.dc.FailoverDoCheck)
                        {
                            sleepCnt = 0;  // restart the count towards the next health check

#if FAILOVER_DEBUG
                            Log("Health check at esr loop;awakeCnt=" + awakeCnt.ToString());
#endif

                            if (CheckHealthESR(false))
                            {
                                DisplayNewBadHosts();
                                RehashESRBlocksFromFailedHosts();
                            }
                            awakeCnt++;
                        }
                    }
                    
#if FAILOVER_DEBUG
                    Log("all esr joined...");
#endif
                    // Join the working blocks that are still good. All of them are joined
                    // before throwing; if several failed, only the last exception is reported.
                    {
                        Exception ee = null;
                        foreach (MapReduceBlockInfo bl in workingBlocks)
                        {
                            if (!bl.diskfailuredetected)  // not flagged with a disk failure
                            {
                                AELight_JoinTraceThread(bl.thread);
                                if (bl.blockfail)
                                {
                                    ee = bl.LastThreadException;
                                }
                            }
                        }
                        if (null != ee)
                        {
                            throw new Exception("ESR workingblock error: " + ee.ToString());
                        }
                    }

                    // Fail on the first child failover that recorded an exception.
                    foreach (FailoverInfo failover in childFailovers.Values)
                    {
                        if (failover.LastException != null)
                        {
                            throw new Exception("childFailovers.count=" + childFailovers.Count.ToString() + ";Child failover error: " + failover.LastException.ToString());
                        }
                    }

#if FAILOVER_DEBUG
                    Log("no esr exceptions...");
#endif

                    // All done. Hand the ESR blocks to the parent only if everything succeeded.
                    if (parent != null)
                    {
#if FAILOVER_DEBUG
                        Log("Adding my esrblocks to parent:");
                        {
                            string debugtxt = "my esrblocks:" + Environment.NewLine;
                            foreach (KeyValuePair<string, List<MapReduceBlockInfo>> pair in hostToESRBlocks)
                            {
                                debugtxt += pair.Key + ":" + pair.Value.Count.ToString() + Environment.NewLine;
                            }
                            Log(debugtxt);
                        }
#endif
                        parent.AddHostToESRBlocks(hostToESRBlocks);
                    }
                }
                catch (Exception e)
                {
                    // Record the failure so the owner can inspect it through LastException.
                    LastException = e;

                    // Best-effort cleanup: a failure in CloseAllBlocks is intentionally
                    // ignored so that the original exception is the one reported.
                    try
                    {
                        CloseAllBlocks();
                    }
                    catch
                    {
                    }
                    
                    LogOutput("ExecOneMapReduceFailover error: " + e.ToString());

#if FAILOVER_DEBUG
                    Log("ExecOneMapReduceFailover error: " + e.ToString());
#endif
                   
                }

                if (parent != null) // report completion to the parent whether or not an exception occurred
                {
#if FAILOVER_DEBUG
                    Log("UpdateBlockStatus mycID: " + myCID.ToString());
#endif
                    parent.UpdateBlockStatus(myCID, 1);
                }

#if FAILOVER_DEBUG
                {
                    string debugtxt = "Exiting ExecOneMapReduceFailover...Final esrblocks:" + Environment.NewLine;
                    foreach (KeyValuePair<string, List<MapReduceBlockInfo>> pair in hostToESRBlocks)
                    {
                        debugtxt += "host=" + pair.Key + ":" + pair.Value.Count.ToString() + Environment.NewLine;
                        foreach (MapReduceBlockInfo bl in pair.Value)
                        {
                            debugtxt += "blockid=" + bl.BlockID.ToString() + ";blockcid=" + bl.BlockCID.ToString() + Environment.NewLine +
                                "owned=" + bl.ownedzmapblocks + Environment.NewLine;
                        }
                    }
                    Log(debugtxt);
                }
#endif
            }