/// <summary>
/// Renders the reduced data into its two final text outputs: a word-frequency
/// report and the formatted rule list with a checksum line.
/// </summary>
/// <param name="ro">
/// Reduced data. Its <c>Tokeniz</c> entries (token → count) and its
/// <c>Result</c> lines are read; nothing on it is modified here.
/// </param>
/// <returns>
/// Item1: one line per token in the form <c>词语:{token};词频:{count}</c>
/// (word / frequency), ordered by descending count and joined with "\n".
/// Item2: the formatted template with all "\r" removed and a
/// <c>! Checksum: ...</c> line inserted after every <c>[AutoProxy]</c> marker.
/// </returns>
public static Tuple<string, string> Finalize(ReduceObject ro)
{
    // Word-frequency report, most frequent token first.
    var blackWords = string.Join(
        "\n",
        ro.Tokeniz
            .OrderByDescending(c => c.Value)
            .Select(c => $"词语:{c.Key};词频:{c.Value}"));

    // The "R" (RFC 1123) specifier always prints a "GMT" suffix but does not
    // convert the value, so the timestamp has to be UTC before formatting.
    var timestamp = DateTime.UtcNow.ToString("R");

    var deepBlackRules = string.Join("\n", deepBlackList.Select(c => $"||{c}")) + google;
    var generatedRules = string.Join("\n", ro.Result);

    // NOTE(review): the format-template identifier below is reproduced exactly as it
    // appears in the source; it is defined outside this block.
    var resultContent = string.Format(f**k, timestamp, deepBlackRules, generatedRules)
        .Replace("\r", "");

    // Compute the checksum over the text that is actually emitted (carriage returns
    // already stripped, checksum line not yet inserted) rather than over the
    // pre-normalisation string.
    // NOTE(review): CalcSum is not visible here; if it normalises line endings itself
    // the result is unchanged by this ordering.
    var checksum = CalcSum(resultContent);
    resultContent = resultContent.Replace("[AutoProxy]", $"[AutoProxy]\n! Checksum: {checksum}");

    return new Tuple<string, string>(blackWords, resultContent);
}
/// <summary>
/// Merges the mapped inputs into rule lines and counts the tokens found in
/// non-ASCII URL queries.
/// </summary>
/// <param name="mo">
/// Mapped input. <c>pureIPAddress</c>, <c>pureHost</c>, <c>urlWithHttps</c> and
/// <c>urlWithQuery</c> are read.
/// </param>
/// <param name="limit">
/// Maximum degree of parallelism for the tokenizing pass; when <c>null</c>,
/// <see cref="Environment.ProcessorCount"/> is used.
/// </param>
/// <returns>
/// A new <c>ReduceObject</c> whose <c>Result</c> holds the merged rule lines and
/// whose <c>Tokeniz</c> holds token → occurrence count.
/// </returns>
public static ReduceObject Reduce(MapObject mo, int? limit = null)
{
    var result = new ReduceObject();

    Console.WriteLine("Merge");
    result.Result.AddRange(mo.pureIPAddress);

    // Hosts become "||host" rules; a leading "www." is dropped first.
    // Ordinal comparison: host names are identifiers, not linguistic text.
    result.Result.AddRange(mo.pureHost.Select(host =>
        host.StartsWith("www.", StringComparison.Ordinal)
            ? $"||{host.Substring(4)}"
            : $"||{host}"));

    // Build the lookup once so each membership test is O(1) instead of a full scan
    // of pureHost per https entry. The default string comparer is ordinal, the same
    // equality the previous `!=` scan used.
    var knownHosts = new HashSet<string>(mo.pureHost);
    foreach (var item in mo.urlWithHttps)
    {
        // Only emit an https-prefix rule when the entry is not already listed as a host.
        if (!knownHosts.Contains(item))
        {
            result.Result.Add("|https://" + item);
        }
    }

    Console.WriteLine("Take BlackWords");
    var tokenCounts = new ConcurrentDictionary<string, int>();
    var options = new ParallelOptions
    {
        MaxDegreeOfParallelism = limit ?? Environment.ProcessorCount
    };

    Parallel.ForEach(mo.urlWithQuery, options, query =>
    {
        var dec = query.UrlDecode();

        // Only queries that decode to something containing non-ASCII characters
        // are tokenized (original note: "with chinese path").
        if (string.IsNullOrWhiteSpace(dec) || !dec.Any(c => c > 127))
        {
            return;
        }

        try
        {
            // NOTE(review): the raw `query` (not the decoded `dec`) is what gets passed
            // to Tokeniz, together with the current managed thread id — confirm that
            // this is what Tokeniz expects.
            var r = Tokeniz(query, System.Threading.Thread.CurrentThread.ManagedThreadId);
            foreach (var t in r.tokens.Where(c => c.type == "CN_WORD" || c.type == "ENGLISH"))
            {
                tokenCounts.AddOrUpdate(t.token, 1, (id, count) => count + 1);
            }
        }
        catch (Exception ex)
        {
            // Tokenizing stays best-effort (one bad query must not abort the run),
            // but the failure is reported instead of being silently discarded.
            Console.Error.WriteLine($"Tokeniz failed for query '{query}': {ex.Message}");
        }
    });

    result.Tokeniz = new Dictionary<string, int>(tokenCounts);
    return result;
}
/// <summary>
/// Aggregates mapped domains and URLs into per-domain counts and a ranked list
/// of tokens taken from non-ASCII URL paths.
/// </summary>
/// <param name="mo">Mapped input; <c>pureDomain</c> and <c>url</c> are read.</param>
/// <returns>
/// A new <c>ReduceObject</c> with <c>BadWords</c> (token/count pairs, highest count
/// first) and <c>Result</c> (domain-or-suffix/count pairs, unordered).
/// </returns>
ReduceObject Reduce(MapObject mo)
{
    var result = new ReduceObject();
    Console.WriteLine("Merge");

    var domains = new ConcurrentDictionary<string, int>();
    var badwords = new ConcurrentDictionary<string, int>();

    // Every bare domain counts once per occurrence.
    mo.pureDomain.AsParallel().ForAll(d => domains.AddOrUpdate(d, 1, IncBy1));

    mo.url.AsParallel().ForAll(url =>
    {
        // Entries are split at the first '/': host before it, path/query after it.
        // An entry without any '/' is treated as host-only instead of throwing
        // from Substring(0, -1).
        var fidx = url.IndexOf('/');
        var domain = fidx < 0 ? url : url.Substring(0, fidx);
        var query = fidx < 0 ? string.Empty : url.Substring(fidx + 1);

        if (query.Any(c => c > 127))
        {
            // Non-ASCII path: count its Chinese tokens instead of counting the domain.
            var r = Tokeniz(query);
            foreach (var t in r.Where(c => c.type == PanGu.WordType.SimplifiedChinese
                                        || c.type == PanGu.WordType.TraditionalChinese))
            {
                badwords.AddOrUpdate(t.token, 1, IncBy1);
            }
        }
        else
        {
            domains.AddOrUpdate(domain, 1, IncBy1);
        }
    });

    result.BadWords = badwords.OrderByDescending(c => c.Value).ToArray();

    var final = new ConcurrentDictionary<string, int>();
    var uc = new ConcurrentDictionary<string, int>();

    // For hosts seen exactly once and having at least three labels, count how many
    // of them share the same suffix (everything after the first label, leading dot kept).
    domains.AsParallel().ForAll(c =>
    {
        if (c.Value != 1)
        {
            return;
        }

        var spl = c.Key.Split('.');
        if (spl.Length > 2)
        {
            uc.AddOrUpdate(c.Key.Substring(spl[0].Length), 1, IncBy1);
        }
    });

    domains.AsParallel().ForAll(c =>
    {
        // Hosts ending in ".cn" are excluded from the result.
        if (c.Key.EndsWith(".cn", StringComparison.Ordinal))
        {
            return;
        }

        if (c.Value > 1)
        {
            final.AddOrUpdate(StripLeadingWww(c.Key), c.Value, (s, b) => b + c.Value);
            return;
        }

        var spl = c.Key.Split('.');
        var pad = c.Key.Substring(spl[0].Length);

        if (spl.Length == 2)
        {
            // Two-label host seen once: keep as is.
            final.AddOrUpdate(c.Key, c.Value, (s, b) => b + c.Value);
        }
        else if (uc.TryGetValue(pad, out int value) && value > 30)
        {
            // More than 30 one-off hosts share this suffix: record the suffix instead
            // of each individual host.
            final.AddOrUpdate(pad, value, IncBy1);
        }
        else
        {
            final.AddOrUpdate(StripLeadingWww(c.Key), c.Value, (s, b) => b + c.Value);
        }
    });

    result.Result = final.ToArray();
    return result;

    // Update callback for AddOrUpdate: bump the existing count by one.
    int IncBy1(string inputString, int inc) => inc + 1;

    // Removes a leading "www" label only. The previous TrimStart('w', 'W') stripped
    // every leading 'w', corrupting hosts such as "weibo.com" -> "eibo.com".
    // The dot after "www" is kept, so "www.example.com" still yields ".example.com"
    // exactly as before.
    string StripLeadingWww(string host) =>
        host.StartsWith("www.", StringComparison.OrdinalIgnoreCase) ? host.Substring(3) : host;
}