/// <summary>
/// Reads the Google locale codes from <c>\google\googleTrans.txt</c> (relative to the current
/// directory) and stores each code in column 7 of the matching row in <paramref name="res"/>.
/// </summary>
/// <param name="res">Row dictionary handed to <c>LangsDesignLib.adjustNewfulltextDataRow</c>.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when the number of locales resolved through <c>Langs.fullNameToMeta</c> differs from the
/// number of codes read from the file.
/// </exception>
public static void Parse(Dictionary<string, LangMatrixRow> res)
{
    // The code lives in the second tab-separated column: keep the text before the first space
    // and strip the "**" markers.
    var googleLocsCodes = File.ReadAllLines(Directory.GetCurrentDirectory() + @"\google\googleTrans.txt")
        .Select(l => l.Split('\t'))
        .Select(p => p[1].Split(' ')[0].Replace("**", ""))
        .ToArray();

    var googleLocs = googleLocsCodes
        .Select(w => LocaleIdentifier.Parse(w).MostLikelySubtags())
        .ToArray();

    var oks = googleLocs
        .Select(loc => Langs.fullNameToMeta.TryGetValue(loc.ToString(), out Langs.CldrLang cl) ? cl : null)
        .NotNulls()
        .ToArray();

    // The index-based pairing below is only valid when every code was resolved.
    if (googleLocsCodes.Length != oks.Length)
    {
        // Collect the offending locales only on the failure path so the error is actionable.
        var unresolved = googleLocs
            .Select(loc => loc.ToString())
            .Where(name => !Langs.fullNameToMeta.ContainsKey(name))
            .ToArray();
        throw new InvalidOperationException(
            $"Google locale codes without CLDR metadata ({googleLocsCodes.Length - oks.Length} of {googleLocsCodes.Length}): {string.Join(", ", unresolved)}");
    }

    oks.ForEach((item, idx) =>
    {
        var row = LangsDesignLib.adjustNewfulltextDataRow(res, item.Id.ToString());
        row.row[7] = googleLocsCodes[idx];
    });
}
/// <summary>
/// Diagnostic comparison of the language prefixes returned by
/// <c>Corpus.DownloadWikies.getUrls()</c> with the CLDR metadata in <c>Langs</c>.
/// Every result is kept in a local only; nothing is returned or stored by this method.
/// </summary>
public static void Build()
{
    // Language prefix = text before the first "wi" in the entry name.
    var langs = Corpus.DownloadWikies.getUrls()
        .Where(u => u.size > /*1000000*/ 0)
        .Select(u => u.name.Split(new string[] { "wi" }, StringSplitOptions.RemoveEmptyEntries)[0])
        .Distinct()
        .ToArray();

    var lmLangs = Langs.meta
        .Select(m => m.Lang)
        .Distinct()
        .ToArray();
    var notInWiki = lmLangs.Except(langs).ToArray();

    // Keep only the prefixes that parse as locale identifiers, then expand them to full names.
    var validLangs = langs
        .Where(code => LocaleIdentifier.TryParse(code, out LocaleIdentifier parsed))
        .ToArray();
    var wikiLocs = validLangs
        .Select(code => LocaleIdentifier.Parse(code).MostLikelySubtags().ToString())
        .ToArray();

    // Split the expanded names into those known to Langs.fullNameToMeta and those that are not.
    var oks = wikiLocs
        .Select(name => Langs.fullNameToMeta.TryGetValue(name, out Langs.CldrLang meta) ? meta : null)
        .NotNulls()
        .ToArray();
    var wrongs = wikiLocs
        .Where(name => !Langs.fullNameToMeta.ContainsKey(name))
        .ToArray();

    //ALPHAs
    // from clibs\utils\unicode\unicodeBlocks.json
    //Armi (http://zuga.net/articles/unicode/script/imperial-aramaic/) and Goth (? https://en.wikipedia.org/wiki/Gothic_alphabet) missing
    var alphas = new HashSet<String>
    {
        "Latn", "Zyyy", "Grek", "Copt", "Cyrl", "Armn", "Hebr", "Arab", "Syrc", "Thaa",
        "Nkoo", "Samr", "Mand", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
        "Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor", "Hang", "Ethi",
        "Cher", "Cans", "Ogam", "Runr", "Tglg", "Hano", "Buhd", "Tagb", "Khmr", "Mong",
        "Limb", "Tale", "Talu", "Bugi", "Lana", "Bali", "Sund", "Batk", "Lepc", "Olck",
        "Glag", "Tfng", "Hira", "Kana", "Bopo", "Hani", "Yiii", "Lisu", "Vaii", "Bamu",
        "Sylo", "Phag", "Saur", "Kali", "Rjng", "Java", "Cham", "Tavt", "Mtei"
    };

    // Unknown locales whose script is not in the list above.
    var wrongAlphas = wrongs
        .Where(name => !alphas.Contains(LocaleIdentifier.Parse(name).Script))
        .ToArray();
    //var path = LangsDesignDirs.cldrRepo;
}
/// <summary>
/// Builds matrix rows from .NET cultures whose language is not covered by
/// <paramref name="cldrSpecifics"/>.
/// </summary>
/// <param name="cldrSpecifics">Locales already handled elsewhere; cultures sharing one of their
/// languages are skipped.</param>
/// <returns>
/// A lazily evaluated sequence with one <c>LangMatrixRow</c> per accepted culture. A culture is
/// accepted when its name parses as a locale identifier, it has a region that does not start with
/// a digit, and its language is not among <paramref name="cldrSpecifics"/>.
/// </returns>
public static IEnumerable<LangMatrixRow> fromNetCultureInfos(LocaleIdentifier[] cldrSpecifics)
{
    // get NON cldr culture data
    var cldrs = new HashSet<string>(cldrSpecifics.Select(c => c.Language));

    return CultureInfo.GetCultures(CultureTypes.AllCultures)
        .Select(cu =>
        {
            // TryParse instead of Parse inside an empty catch: no exception-driven control flow,
            // and unrelated failures are no longer silently swallowed.
            if (!LocaleIdentifier.TryParse(cu.Name, out LocaleIdentifier lid)
                || lid == null
                || string.IsNullOrEmpty(lid.Region)
                || char.IsDigit(lid.Region[0])
                || cldrs.Contains(lid.Language))
            {
                return null;
            }

            var res = CldrUtils.getNetRowData(cu, lid.Language, lid.Region, out LocaleIdentifier locId);
            return new LangMatrixRow
            {
                lang = locId.ToString(),
                row = res,
            };
        })
        .Where(lt => lt != null);
}
/// <summary>
/// Runs <c>UnicodeBlocks.checkBlockNames</c> on this row's texts, using the script parsed from
/// <c>lang</c>, and records a non-null result in <paramref name="protocol"/> under <c>lang</c>.
/// </summary>
/// <param name="protocol">Per-language findings; only written when the check returns non-null.</param>
public void checkTexts(Dictionary<string, Dictionary<string, string>> protocol)
{
    var script = LocaleIdentifier.Parse(lang).Script;
    var findings = UnicodeBlocks.checkBlockNames(row, script);
    if (findings != null)
    {
        protocol[lang] = findings;
    }
}
/// <summary>
/// Fills a row of <c>count</c> cells with month, genitive-month and day names taken from the
/// culture's <c>DateTimeFormat</c>, and derives the locale identifier from the script detected
/// in those texts.
/// </summary>
/// <param name="lc">Culture supplying the date/time names.</param>
/// <param name="Language">Language subtag used for the resulting identifier.</param>
/// <param name="Region">Region subtag used for the resulting identifier.</param>
/// <param name="locId">Receives the parsed "Language-script-Region" identifier.</param>
/// <returns>The filled row.</returns>
static string[] getNetRowData(CultureInfo lc, string Language, string Region, out LocaleIdentifier locId)
{
    var cells = new string[count];
    var dateNames = lc.DateTimeFormat;

    // First 12 month names (plain and genitive) and the 7 day names, each at its own offset.
    dateNames.MonthNames.Take(12).ToArray(cells, monthsIdx);
    dateNames.MonthGenitiveNames.Take(12).ToArray(cells, smonthsIdx);
    dateNames.DayNames.Take(7).ToArray(cells, daysIdx);

    // Single() requires that exactly one block name is reported for the collected texts.
    var detectedScript = UnicodeBlocks.getBlockNames(cells)
        .Select(kv => kv.Key)
        .Single();

    var fullName = string.Format("{0}-{1}-{2}", Language, detectedScript, Region);
    locId = LocaleIdentifier.Parse(fullName);
    return cells;
}
/// <summary>
/// Creates <c>Langs.CldrLang</c> entries for a fixed list of language codes plus the language
/// part of <c>Langs.invariantId</c>, filling script and region from the most likely subtags.
/// </summary>
/// <returns>One entry per code, each with a single region (its default region).</returns>
public static Langs.CldrLang[] getMissingLangs()
{
    var codes = new string[] { "ceb", "ht", "hmn", "la", "ny", "sm", "su", Langs.invariantId.Split('-')[0] };

    return codes
        .Select(code => LocaleIdentifier.Parse(code).MostLikelySubtags())
        .Select(loc => new Langs.CldrLang
        {
            Id = string.Format("{0}-{1}", loc.Language, loc.Region),
            Lang = loc.Language,
            ScriptId = loc.Script,
            DefaultRegion = loc.Region,
            Regions = new string[] { loc.Region }
        })
        .ToArray();
}
/// <summary>
/// Loads the CLDR language list, builds a <c>Locale</c> for every language/script/region
/// combination and exports exemplar characters, display patterns and language/script/territory
/// names as CSV files (under <c>LangsDesignDirs.cldr</c>) and as <c>.msg</c> files
/// (under <c>LangsDesignDirs.data</c>).
/// </summary>
public static void Build()
{
    var cldr = Json.Deserialize<Langs.CldrLang[]>(LangsDirs.dirCldrTexts);

    // One locale per (language, script, region) combination.
    var roots = cldr
        .SelectMany(c => c.Regions.Select(r => LocaleIdentifier.Parse(string.Format("{0}-{1}-{2}", c.Lang, c.ScriptId, r))))
        .ToArray();
    var locs = roots
        .Select(root => new Locale(root))
        .ToArray();

    // ALPHABETS
    var rx = new Regex("[{} ]");
    var alphas = locs
        .Select(loc =>
        {
            var characters = loc.Find("//characters");
            var data = characters
                .SelectChildren(XPathNodeType.Element)
                .OfType<XPathNavigator>()
                .Where(al => al.Name == "exemplarCharacters")
                .Select(al =>
                {
                    // Strip the surrounding brackets, decode literals, then drop braces and spaces.
                    var decoded = CldrUtils.decodeUnicodeLiteral(al.Value.Normalize().Trim('[', ']'));
                    var value = rx.Replace(decoded, "");
                    // An element without a type attribute is filed under "root".
                    var keyNode = al.SelectSingleNode("./@type");
                    var key = keyNode == null ? "root" : keyNode.Value;
                    //if (key == "numbers" || key == "punctuation" || string.IsNullOrEmpty(value)) return null;
                    return new { key, value };
                })
                .Where(n => n != null)
                .ToArray();
            return new { lang = loc.Id.ToString(), data };
        })
        .Where(d => d.data.Length > 0)
        .OrderBy(l => l.lang)
        .ToArray();

    // Rows for one exemplar-character kind: a single column named after the kind.
    IEnumerable<LangMatrixRow> alphasRes(string key)
    {
        return alphas.SelectMany(a => a.data
            .Where(aa => aa.key == key)
            .Select(aa => new LangMatrixRow
            {
                lang = a.lang,
                columnNames = Linq.Items(aa.key).ToArray(),
                row = Linq.Items(aa.value).ToArray()
            }));
    }

    var alphaRoot = new LangMatrix(alphasRes("root"), null, true);
    var alphaAuxlity = new LangMatrix(alphasRes("auxiliary"), null, true);
    var alphaIndex = new LangMatrix(alphasRes("index"), null, true);
    var alphaNumbers = new LangMatrix(alphasRes("numbers"), null, true);
    var alphaPunctuation = new LangMatrix(alphasRes("punctuation"), null, true);

    // Locale display pattern and separator per locale.
    var patternRows = locs.Select(loc =>
    {
        var localePattern = loc.FindOrDefault("//localeDisplayNames/localeDisplayPattern/localePattern").ToString();
        var localeSeparator = loc.FindOrDefault("//localeDisplayNames/localeDisplayPattern/localeSeparator").ToString();
        return new LangMatrixRow
        {
            lang = loc.Id.ToString(),
            row = new string[] { localePattern, localeSeparator },
            columnNames = new string[] { "pattern", "separator" },
        };
    });
    var patterns = new LangMatrix(patternRows, null, true);

    // Display names; each matrix gets its own protocol dictionary.
    var langsProtocol = new Dictionary<string, Dictionary<string, string>>();
    var langs = new LangMatrix(
        locs.Select(loc => fromCldr(loc, "//localeDisplayNames/languages")),
        langsProtocol,
        true);

    var scriptsProtocol = new Dictionary<string, Dictionary<string, string>>();
    var scripts = new LangMatrix(
        locs.Select(loc => fromCldr(loc, "//localeDisplayNames/scripts")),
        scriptsProtocol,
        true);

    var regionsProtocol = new Dictionary<string, Dictionary<string, string>>();
    var regions = new LangMatrix(
        locs.Select(loc => fromCldr(loc, "//localeDisplayNames/territories")),
        regionsProtocol,
        true);

    // CSV output
    langs.save(LangsDesignDirs.cldr + "cldrNameLangs.csv", true);
    scripts.save(LangsDesignDirs.cldr + "cldrNameScripts.csv", true);
    regions.save(LangsDesignDirs.cldr + "cldrNameRegions.csv", true);
    patterns.save(LangsDesignDirs.cldr + "cldrNamePatterns.csv", true);
    alphaRoot.save(LangsDesignDirs.cldr + "alphaRoot.csv", true);
    alphaAuxlity.save(LangsDesignDirs.cldr + "alphaAuxlity.csv", true);
    alphaIndex.save(LangsDesignDirs.cldr + "alphaIndex.csv", true);
    alphaNumbers.save(LangsDesignDirs.cldr + "alphaNumbers.csv", true);
    alphaPunctuation.save(LangsDesignDirs.cldr + "alphaPunctuation.csv", true);

    // save to DART messages
    void saveMsg(string relativePath, LangMatrix matrix)
    {
        File.WriteAllBytes(LangsDesignDirs.data + relativePath, Protobuf.ToBytes(matrixToDart(matrix)));
    }

    saveMsg(@"langsDesign\cldrNameLangs.msg", langs);
    saveMsg(@"langsDesign\cldrNameScripts.msg", scripts);
    saveMsg(@"langsDesign\cldrNameRegions.msg", regions);
    saveMsg(@"langsDesign\cldrNamePatterns.msg", patterns);
    saveMsg(@"langsDesign\alphaRoot.msg", alphaRoot);
    saveMsg(@"langsDesign\alphaAuxlity.msg", alphaAuxlity);
    saveMsg(@"langsDesign\alphaIndex.msg", alphaIndex);
    saveMsg(@"langsDesign\alphaNumbers.msg", alphaNumbers);
    saveMsg(@"langsDesign\alphaPunctuation.msg", alphaPunctuation);
}
/// <summary>
/// Builds the region/language population table from the <c>territoryInfo</c> element of
/// <c>supplements</c> and writes three grouped summaries as JSON under
/// <c>LangsDesignDirs.root + "patches\"</c>: <c>cldrRegions.json</c>,
/// <c>cldrLangsNotOfficial.json</c> and <c>cldrLangs.json</c>.
/// </summary>
/// <param name="english">English display names used to label regions and languages.</param>
static void computeLangRegionEnglish(EnglishNames english)
{
    // lang -> "script" or "script|other1,other2"
    var scripts = extractScript().ToDictionary(
        s => s.lang,
        s => s.script + (s.other == null ? "" : "|" + string.Join(",", s.other)));

    // **************************
    // ***** REGION-LANG table: <region,lang>,<population,isOfficial,script>
    var regionLang = supplements.Descendants("territoryInfo")
        .Single()
        .Elements("territory")
        .SelectMany(el =>
        {
            // Machine-readable XML: parse with the invariant culture, like the percentage below.
            var allPop = int.Parse(el.Attribute("population").Value, CultureInfo.InvariantCulture);
            var region = el.Attribute("type").Value;
            return el.Elements("languagePopulation")
                .Select(ee => new regLang
                {
                    region = region,
                    lang = ee.Attribute("type").Value,
                    // Speakers = territory population * percentage; a missing percentage counts as 0.
                    population = (int)Math.Round((double)allPop * float.Parse(ee.Attribute("populationPercent")?.Value ?? "0", CultureInfo.InvariantCulture) / 100),
                    isOfficial = ee.Attribute("officialStatus") != null,
                });
        })
        .ToArray();

    // **************************
    // ***** population grouping
    // group and sum by <region> or <lang>
    void groupAndSum(regLang[] data, bool byReg, bool noOffOnly = false)
    {
        // isKey == true  -> the grouping key itself (region when byReg, otherwise language);
        // isKey == false -> the other dimension. sh == true returns the bare id, not the English name.
        string friendlyName(regLang d, bool isKey, bool sh = false)
        {
            var actId = byReg == isKey ? d.region : d.lang;
            if (sh)
            {
                return actId;
            }
            var actNames = byReg == isKey ? english.Teritories : english.Langs;
            return actNames[actId];
        }

        // "name:id:population" items (population > 0, largest first) for the official or the
        // non-official members of a group; null when there are none.
        string stringDetail(IGrouping<string, regLang> g, bool officialOnly)
        {
            var sd = string.Join(", ",
                g.Where(l => officialOnly == l.isOfficial && l.population > 0)
                    .OrderByDescending(l => l.population)
                    .Select(l => $"{friendlyName(l, false, false)}:{friendlyName(l, false, true)}:{l.population}"));
            return sd == "" ? null : sd;
        }

        var by = byReg ? data.GroupBy(d => d.region) : data.GroupBy(d => d.lang);
        var res = by
            .Select(g => new LangOrRegion
            {
                name = friendlyName(g.First(), true),
                id = g.Key,
                offSum = g.Where(l => l.isOfficial).Sum(l => l.population),
                nonOffSum = g.Where(l => !l.isOfficial).Sum(l => l.population),
                off = stringDetail(g, true),
                nonOff = stringDetail(g, false),
                likely = byReg ? null : LocaleIdentifier.Parse(g.Key).MostLikelySubtags().ToString(),
                scripts = byReg ? null : scripts[g.Key],
            })
            // Drop "und"; keep either groups with no official population (noOffOnly) or groups with any population.
            .Where(r => r.id != "und" && (noOffOnly ? r.offSum == 0 : r.offSum + r.nonOffSum > 0))
            .OrderByDescending(r => r.offSum + r.nonOffSum)
            .ToArray();

        var nm = byReg ? "Regions" : (noOffOnly ? "LangsNotOfficial" : "Langs");
        Json.Serialize(LangsDesignDirs.root + $"patches\\cldr{nm}.json", res);
    }

    groupAndSum(regionLang, true);
    groupAndSum(regionLang, false, true);
    groupAndSum(regionLang, false);
}
/// <summary>
/// Reads <c>\cldr\langScripts.xml</c> (relative to the current directory), collects the scripts
/// listed per language, reconciles them with the most likely script of each language, adds
/// entries for languages in <c>allLangs</c> that the table does not cover, and writes the result
/// to <c>patches\cldrScript.json</c> under <c>LangsDesignDirs.root</c>.
/// </summary>
/// <returns>The entries, ordered by number of additional scripts (descending), then by language.</returns>
static List<LangScripts> extractScript()
{
    var knownScripts = UnicodeBlocks.ISO15924.ToHashSet();
    foreach (var extra in new[] { "Jpan", "Kore", "Hans", "Hant" })
    {
        knownScripts.Add(extra);
    }

    var res = new List<LangScripts>();
    var tableRows = XElement.Load(Directory.GetCurrentDirectory() + @"\cldr\langScripts.xml").Descendants("tr");
    foreach (var tr in tableRows)
    {
        // last TD with value.length==4 is SCRIPT
        var script = tr.Elements()
            .Select(cell => cell.Value.Trim())
            .Last(text => text.Length == 4);

        // first TR with seven TD child defines LANG
        if (tr.Elements().Count() == 7)
        {
            res.Add(new LangScripts { lang = tr.Descendants("a").Single().Attribute("name").Value });
        }

        // obsolete script? (info in last node)
        var lastCell = (tr.LastNode as XElement).Value.Trim();
        var inUse = knownScripts.Contains(script) && lastCell != "N" && lastCell != "n/a";
        if (inUse)
        {
            // A row without its own language cell belongs to the most recently added language.
            res.Last().other.Add(script);
        }
    }

    res = res.Where(entry => allLangs.Contains(entry.lang)).ToList();

    foreach (var entry in res)
    {
        var likelyScript = LocaleIdentifier.Parse(entry.lang).MostLikelySubtags().Script;
        if (entry.other.Count == 0)
        {
            // no script found => add "2?" + likely
            entry.script = "2?" + likelyScript;
            entry.other = null;
        }
        else
        {
            // likely is not in found script => add "1?" + likely
            entry.script = entry.other.Contains(likelyScript) ? likelyScript : "1?" + likelyScript;
            entry.other.Remove(likelyScript);
            if (entry.other.Count == 0)
            {
                entry.other = null;
            }
        }
    }

    // add missing langs: add script from lang (e.g. Arab for uz_Arab) OR "3?" + likely
    foreach (var missing in allLangs.Except(res.Select(entry => entry.lang)))
    {
        var script = missing.Contains('_')
            ? missing.Split('_')[1]
            : "3?" + LocaleIdentifier.Parse(missing).MostLikelySubtags().Script;
        res.Add(new LangScripts { lang = missing, script = script, other = null });
    }

    foreach (var entry in res)
    {
        entry.name = langTrans[entry.lang];
    }

    // sort, write and return
    res = res
        .OrderByDescending(entry => entry.other?.Count ?? 0)
        .ThenBy(entry => entry.lang)
        .ToList();
    Json.Serialize(LangsDesignDirs.root + $"patches\\cldrScript.json", res);
    return res;
}