/// <summary>
/// Trims every bag of <paramref name="lexicon"/> to its strongest entries and then removes
/// bags that ended up under-weight and are not kept as an entry of any trimmed bag.
/// </summary>
/// <param name="lexicon">Bags by key. Bags are rewritten in place and entries may be removed.</param>
/// <param name="lang">Supplies the key comparison used to break ties between equal counts.</param>
/// <param name="weight">Minimum <c>Weight</c>: entries pointing at lighter bags are dropped, and
/// bags lighter than this after trimming become candidates for removal.</param>
/// <param name="limit">Maximum number of entries kept per bag; zero or less keeps none.</param>
static void Reduce(this IDictionary<string, Bag> lexicon, System.Language.IOrthography lang, int weight, int limit)
{
    var underweight = new HashSet<string>();
    var referenced = new HashSet<string>();

    Parallel.ForEach(lexicon, entry =>
    {
        // A single bag should never be worked on concurrently
        Bag current = entry.Value;

        // Collect the entries whose key resolves to a lexicon bag that is heavy enough.
        // NOTE(review): the looked-up bag may be in the middle of being rewritten by another
        // worker; whether that can affect its Weight depends on Bag - confirm.
        var candidates = new List<KeyValuePair<string, int>>();
        current.ForEach((neighbor, count) =>
        {
            Bag target;
            if (lexicon.TryGetValue(neighbor, out target) && target != null && target.Weight >= weight)
            {
                candidates.Add(new KeyValuePair<string, int>(neighbor, count));
            }
        });

        // Highest count first; equal counts are ordered by the orthography's comparison.
        candidates.Sort((x, y) =>
        {
            int byCount = y.Value.CompareTo(x.Value);
            return byCount != 0 ? byCount : lang.Compare(x.Key, y.Key);
        });

        // Rebuild the bag from the first `limit` candidates only.
        int keep = Math.Min(candidates.Count, Math.Max(limit, 0));
        current.Clear();
        for (int i = 0; i < keep; i++)
        {
            KeyValuePair<string, int> kept = candidates[i];
            current.Add(kept.Key, kept.Value);
            lock (referenced)
            {
                referenced.Add(kept.Key);
            }
        }

        if (current.Weight < weight)
        {
            lock (underweight)
            {
                underweight.Add(current.Key);
            }
        }
    });

    // Only under-weight bags that no trimmed bag still lists are removed.
    underweight.ExceptWith(referenced);
    foreach (string orphan in underweight)
    {
        lexicon.Remove(orphan);
    }
}
/// <summary>
/// Builds a lexicon of bags by handing <paramref name="paths"/> and <paramref name="search"/> to
/// <c>Tokens.Parse</c> together with a token filter and a per-document merge step.
/// </summary>
/// <param name="WINDOW">Passed unchanged to <c>Bags.Compute</c> for every document.</param>
/// <param name="lang">Orthography used to convert tokens and to test their legibility.</param>
/// <param name="support">Optional mapping applied to each token that survived the filters;
/// its result is converted and legibility-checked again. May be null.</param>
/// <param name="ignore">Tokens that are dropped outright. Must not be null.</param>
/// <param name="paths">Forwarded to <c>Tokens.Parse</c>.</param>
/// <param name="search">Forwarded to <c>Tokens.Parse</c>; defaults to <c>"*.*"</c>.</param>
/// <returns>The lexicon, keyed by bag key.</returns>
static IDictionary<string, Bag> Build(int WINDOW, System.Language.IOrthography lang, Func<string, string> support, ISet<string> ignore, string[] paths, string search = "*.*")
{
    var lexicon = new Dictionary<string, Bag>();

    // True when the character changes under invariant upper-casing, i.e. it is a lower-case
    // letter with an upper-case form. Upper-case letters, digits etc. yield false.
    Func<char, bool> isLower = ch => char.ToUpperInvariant(ch) != ch;

    // Spellings exempt from the roman-numeral filter below.
    string[] romanLookalikes =
    {
        "ui", "uim", "uix", "uii", "lux", "lum", "cum", "cui", "mum",
        "mi", "id", "mix", "diu", "dix", "di", "dii", "dux", "dum"
    };

    char[] vowels = { 'a', 'e', 'i', 'o', 'u' };

    // Suffixes removed from longer tokens, tried in this order.
    string[] strippedSuffixes = { "que", "QVE" };

    Tokens.Parse(paths, search, (TOKEN, EMIT) =>
    {
        // A token whose second character is lower case is folded to lower case entirely
        // ("Roma" becomes "roma"); tokens such as "ROMA" keep their casing.
        var token = TOKEN;
        if (token.Length > 1 && isLower(token[1]))
        {
            token = token.ToLowerInvariant();
        }

        string s = lang.Convert(token);
        if (!lang.IsLegible(s))
        {
            return;
        }

        /* Do not take single letter entries unless they start with upper case */
        if (s.Length == 1 && isLower(s[0]))
        {
            return;
        }

        if (s.Length > 1 && isLower(s[0]))
        {
            // Detect tokens made of one letter repeated (compared case-insensitively).
            bool same = true;
            for (int i = 1; i < s.Length; i++)
            {
                if (char.ToUpperInvariant(s[i]) != char.ToUpperInvariant(s[i - 1]))
                {
                    same = false;
                    break;
                }
            }

            /* Do not take roman numerals */
            if (Array.IndexOf(romanLookalikes, s) < 0)
            {
                var n = RomanToInteger(s);
                if (n.HasValue)
                {
                    // Only reject spellings that round-trip exactly.
                    var c = IntegerToRoman(n.Value);
                    if (c == s)
                    {
                        return;
                    }
                }
            }

            if (same)
            {
                return;
            }

            /* Must have at least one vowel
             *
             * - Abbreviations should be capitalized.
             * - Foreign words might be ignored which is a good side effect.
             */
            if (s.IndexOfAny(vowels) < 0)
            {
                return;
            }
        }

        // Ordinal comparison: exactly three characters are cut off below, so the match must be
        // on those exact three characters and must not depend on the current culture.
        // Note: the shortened token is not run through the filters above again.
        foreach (string suffix in strippedSuffixes)
        {
            if (s.Length > suffix.Length && s.EndsWith(suffix, StringComparison.Ordinal))
            {
                s = s.Substring(0, s.Length - suffix.Length);
            }
        }

        if (ignore.Contains(s))
        {
            return;
        }

        if (support != null)
        {
            s = lang.Convert(support(s));
            if (!lang.IsLegible(s))
            {
                return;
            }
        }

        if (EMIT != null)
        {
            EMIT(s);
        }
    },
    (FILE, DOC) =>
    {
        Log(Path.GetFullPath(FILE));

        // A neighbor is accepted only when its first character is in the same case class
        // (lower vs. not lower) as the first character of the focus token.
        var bags = Bags.Compute(DOC, WINDOW, (FOCUS, NEIGHBOR, Δ) =>
        {
            return isLower(FOCUS[0]) == isLower(NEIGHBOR[0]);
        });

        foreach (var bag in bags)
        {
            // The lexicon is shared by all documents, so every merge is serialized on it.
            lock (lexicon)
            {
                Bag lex;
                string key = bag.Key;
                if (!lexicon.TryGetValue(key, out lex))
                {
                    // A new bag receives the current lexicon size as its second argument.
                    lexicon[key] = lex = new Bag(key, lexicon.Count);
                }
                lex.Add(bag, bag.Weight);
            }
        }
    });

    return lexicon;
}