Example #1
0
        /// <summary>
        /// Builds the two final text outputs from a reduced result: a word-frequency report
        /// and the formatted rule-list content.
        /// </summary>
        /// <param name="ro">Reduced data; its <c>Tokeniz</c> word counts and <c>Result</c> lines are read.</param>
        /// <returns>
        /// A tuple whose first item is the word-frequency report (one line per word, highest count first)
        /// and whose second item is the formatted list content, with every "\r" removed and a
        /// "! Checksum: ..." line inserted after each <c>[AutoProxy]</c> marker.
        /// </returns>
        public static Tuple <string, string> Finalize(ReduceObject ro)
        {
            var blackWords = string.Join("\n",
                                         ro.Tokeniz
                                           .OrderByDescending(c => c.Value)
                                           .Select(c => $"词语:{c.Key};词频:{c.Value}"));

            // The "R" (RFC1123) specifier always prints a literal "GMT" suffix but does not convert
            // the value, so the timestamp has to be UTC already; a local time would be mislabelled.
            var timestamp = DateTime.UtcNow.ToString("R");

            var resultContent = string.Format(f**k,
                                              timestamp,
                                              string.Join("\n", deepBlackList.Select(c => $"||{c}")) + google,
                                              string.Join("\n", ro.Result));

            // The checksum is taken over the content exactly as formatted above, i.e. before "\r" is stripped.
            // NOTE(review): the returned text has "\r" removed; confirm CalcSum normalises line endings itself.
            var checksum = CalcSum(resultContent);

            resultContent = resultContent.Replace("\r", "")
                                         .Replace("[AutoProxy]", $"[AutoProxy]\n! Checksum: {checksum}");
            return(new Tuple <string, string>(blackWords, resultContent));
        }
Example #2
0
        /// <summary>
        /// Merges the mapped inputs into rule lines and counts the words found in URL queries.
        /// </summary>
        /// <param name="mo">
        /// Mapped input; <c>pureIPAddress</c>, <c>pureHost</c>, <c>urlWithHttps</c> and <c>urlWithQuery</c> are read.
        /// </param>
        /// <param name="limit">
        /// Maximum degree of parallelism for tokenizing; <c>null</c> means <see cref="Environment.ProcessorCount"/>.
        /// </param>
        /// <returns>
        /// A <c>ReduceObject</c> whose <c>Result</c> holds the merged rule lines and whose <c>Tokeniz</c>
        /// maps each extracted word to the number of times it was seen.
        /// </returns>
        public static ReduceObject Reduce(MapObject mo, int?limit = null)
        {
            var result = new ReduceObject();

            Console.WriteLine("Merge");
            result.Result.AddRange(mo.pureIPAddress);
            // Hosts become "||host" rules, with a leading "www." dropped.
            result.Result.AddRange(mo.pureHost.Select(c => c.StartsWith("www.", StringComparison.Ordinal)
                                                               ? $"||{c.Substring(4)}"
                                                               : $"||{c}"));

            // An https entry is only added when it is not already listed as a plain host.
            // A set gives the same ordinal equality check without rescanning pureHost per item.
            var knownHosts = new HashSet <string>(mo.pureHost);
            foreach (var item in mo.urlWithHttps)
            {
                if (!knownHosts.Contains(item))
                {
                    result.Result.Add("|https://" + item);
                }
            }

            Console.WriteLine("Take BlackWords");
            var tokeniz = new ConcurrentDictionary <string, int>();

            Parallel.ForEach(mo.urlWithQuery, new ParallelOptions {
                MaxDegreeOfParallelism = limit ?? Environment.ProcessorCount
            }, query =>
            {
                var dec = query.UrlDecode();

                // Only queries whose decoded form contains non-ASCII characters (e.g. a Chinese path) are tokenized.
                if (string.IsNullOrWhiteSpace(dec) || !dec.Any(c => c > 127))
                {
                    return;
                }

                try
                {
                    var r = Tokeniz(query, System.Threading.Thread.CurrentThread.ManagedThreadId);
                    foreach (var t in r.tokens.Where(c => c.type == "CN_WORD" || c.type == "ENGLISH"))
                    {
                        tokeniz.AddOrUpdate(t.token, 1, (id, count) => count + 1);
                    }
                }
                catch (Exception ex)
                {
                    // Tokenizing stays best-effort: one failing query must not abort the whole run,
                    // but the failure is reported instead of being silently dropped.
                    Console.Error.WriteLine($"Tokeniz failed for \"{query}\": {ex.Message}");
                }
            });
            result.Tokeniz = new Dictionary <string, int>(tokeniz);
            return(result);
        }
Example #3
0
        /// <summary>
        /// Aggregates mapped domains and URLs into a weighted domain list and a list of Chinese words
        /// found in URL paths.
        /// </summary>
        /// <param name="mo">Mapped input; <c>pureDomain</c> and <c>url</c> are read.</param>
        /// <returns>
        /// A <c>ReduceObject</c> whose <c>BadWords</c> holds the extracted words ordered by descending count
        /// and whose <c>Result</c> holds the final domain/count pairs.
        /// </returns>
        ReduceObject Reduce(MapObject mo)
        {
            var result = new ReduceObject();

            Console.WriteLine("Merge");
            var domains  = new ConcurrentDictionary <string, int>();
            var badwords = new ConcurrentDictionary <string, int>();

            mo.pureDomain.AsParallel().ForAll(d =>
            {
                domains.AddOrUpdate(d, 1, IncBy1);
            });
            mo.url.AsParallel().ForAll(url =>
            {
                var fidx = url.IndexOf('/');
                // A URL without '/' has no path part: count it as a bare domain
                // instead of letting Substring(0, -1) throw.
                var domain = fidx < 0 ? url : url.Substring(0, fidx);
                var query  = fidx < 0 ? string.Empty : url.Substring(fidx + 1);

                if (query.Any(c => c > 127))
                {
                    // Non-ASCII path: collect its Chinese words; the domain itself is not counted here.
                    var r = Tokeniz(query);
                    foreach (var t in r.Where(c => c.type == PanGu.WordType.SimplifiedChinese || c.type == PanGu.WordType.TraditionalChinese))
                    {
                        badwords.AddOrUpdate(t.token, 1, IncBy1);
                    }
                }
                else
                {
                    domains.AddOrUpdate(domain, 1, IncBy1);
                }
            });
            result.BadWords = badwords.OrderByDescending(c => c.Value).ToArray();

            var final = new ConcurrentDictionary <string, int>();
            var uc    = new ConcurrentDictionary <string, int>();

            // First pass: for domains seen exactly once and having more than two labels,
            // count how many of them share the same ".rest" suffix (first label removed).
            domains.AsParallel().ForAll(c =>
            {
                if (c.Value == 1)
                {
                    var spl = c.Key.Split('.');
                    if (spl.Length > 2)
                    {
                        uc.AddOrUpdate(c.Key.Substring(spl[0].Length), 1, IncBy1);
                    }
                }
            });

            // Second pass: build the final list, skipping ".cn" domains.
            domains.AsParallel().ForAll(c =>
            {
                if (c.Key.EndsWith(".cn", StringComparison.Ordinal))
                {
                    return;
                }
                if (c.Value > 1)
                {
                    final.AddOrUpdate(StripWww(c.Key), c.Value, (s, b) => b + c.Value);
                }
                else
                {
                    var spl = c.Key.Split('.');
                    var pad = c.Key.Substring(spl[0].Length);
                    if (spl.Length == 2)
                    {
                        final.AddOrUpdate(c.Key, c.Value, (s, b) => b + c.Value);
                    }
                    else if (uc.TryGetValue(pad, out int value) && value > 30)
                    {
                        // More than 30 one-off hosts share this suffix: record the suffix instead of each host.
                        final.AddOrUpdate(pad, value, IncBy1);
                    }
                    else
                    {
                        final.AddOrUpdate(StripWww(c.Key), c.Value, (s, b) => b + c.Value);
                    }
                }
            });
            result.Result = final.ToArray();
            return(result);

            int IncBy1(string inputString, int inc) => inc + 1;

            // Removes a leading "www" label only. The dot is kept, so "www.example.com" becomes
            // ".example.com" (the same ".rest" form used for suffix keys above). Hosts that merely
            // start with the letter 'w' are returned unchanged.
            string StripWww(string host) =>
                host.StartsWith("www.", StringComparison.OrdinalIgnoreCase) ? host.Substring(3) : host;
        }