Ejemplo n.º 1
0
    /// <summary>
    /// Reads the Google locale codes from <c>google\googleTrans.txt</c> (under the current directory) and stores
    /// each code in column 7 of the row obtained from <c>LangsDesignLib.adjustNewfulltextDataRow</c>.
    /// </summary>
    /// <remarks>
    /// The file is tab-separated; the code is the first space-delimited token of the second column with any
    /// "**" marker removed. Each code is expanded with <c>MostLikelySubtags</c> and looked up in
    /// <c>Langs.fullNameToMeta</c>.
    /// </remarks>
    /// <param name="res">Dictionary handed to <c>LangsDesignLib.adjustNewfulltextDataRow</c> together with the language id.</param>
    /// <exception cref="Exception">
    /// The number of resolved languages differs from the number of codes read; the message lists the
    /// expanded locales that were not found.
    /// </exception>
    public static void Parse(Dictionary <string, LangMatrixRow> res)
    {
        var googleLocsCodes = File.ReadAllLines(Directory.GetCurrentDirectory() + @"\google\googleTrans.txt").
                              Select(l => l.Split('\t')).
                              Select(p => p[1].Split(' ')[0].Replace("**", "")).
                              ToArray();

        // Expand every code once and keep the string form: it is the lookup key for both queries below.
        var googleLocs = googleLocsCodes.
                         Select(w => LocaleIdentifier.Parse(w).MostLikelySubtags().ToString()).
                         ToArray();
        var oks = googleLocs.
                  Select(loc => Langs.fullNameToMeta.TryGetValue(loc, out Langs.CldrLang cl) ? cl : null).
                  NotNulls().
                  ToArray();
        var wrongs = googleLocs.
                     Select(loc => Langs.fullNameToMeta.TryGetValue(loc, out Langs.CldrLang cl) ? null : loc).
                     NotNulls().
                     ToArray();

        if (googleLocsCodes.Length != oks.Length)
        {
            // Name the offenders instead of throwing an empty exception.
            throw new Exception($"googleTrans.txt: {googleLocsCodes.Length} codes read but only {oks.Length} resolved; not found: {string.Join(", ", wrongs)}");
        }

        // Indexing googleLocsCodes by the position in oks is only valid because the check above
        // guarantees that no code was dropped.
        oks.ForEach((item, idx) => {
            var row    = LangsDesignLib.adjustNewfulltextDataRow(res, item.Id.ToString());
            row.row[7] = googleLocsCodes[idx];
        });
    }
Ejemplo n.º 2
0
    /// <summary>
    /// Diagnostic pass that compares the language codes found in the wiki URL list with the CLDR
    /// metadata held in <c>Langs</c>.
    /// </summary>
    /// <remarks>
    /// Nothing is returned or written; every result stays in a local variable
    /// (presumably meant to be inspected in a debugger - confirm).
    /// </remarks>
    public static void Build()
    {
        // Language code = first non-empty piece of the wiki name when split on "wi".
        var separator = new string[] { "wi" };
        var langs     = Corpus.DownloadWikies.getUrls().
                        Where(u => u.size > /*1000000*/ 0).
                        Select(u => u.name.Split(separator, StringSplitOptions.RemoveEmptyEntries)[0]).
                        Distinct().
                        ToArray();

        var lmLangs   = Langs.meta.Select(l => l.Lang).Distinct().ToArray();
        var notInWiki = lmLangs.Except(langs).ToArray();

        // Keep only the codes that are valid locale identifiers, then expand them to full locales.
        var validLangs = langs.
                         Where(code => LocaleIdentifier.TryParse(code, out LocaleIdentifier ignored)).
                         ToArray();
        var wikiLocs = validLangs.
                       Select(code => LocaleIdentifier.Parse(code).MostLikelySubtags().ToString()).
                       ToArray();

        // Split the expanded locales into those known to Langs.fullNameToMeta and those that are not.
        var oks = wikiLocs.
                  Select(name => Langs.fullNameToMeta.TryGetValue(name, out Langs.CldrLang meta) ? meta : null).
                  NotNulls().
                  ToArray();
        var wrongs = wikiLocs.
                     Where(name => !Langs.fullNameToMeta.TryGetValue(name, out Langs.CldrLang meta)).
                     ToArray();

        // Script codes taken from clibs\utils\unicode\unicodeBlocks.json.
        // Armi (http://zuga.net/articles/unicode/script/imperial-aramaic/) and
        // Goth (? https://en.wikipedia.org/wiki/Gothic_alphabet) are missing.
        var alphas = new HashSet <String> {
            "Latn", "Zyyy", "Grek", "Copt", "Cyrl", "Armn", "Hebr", "Arab", "Syrc", "Thaa",
            "Nkoo", "Samr", "Mand", "Deva", "Beng", "Guru", "Gujr", "Orya", "Taml", "Telu",
            "Knda", "Mlym", "Sinh", "Thai", "Laoo", "Tibt", "Mymr", "Geor", "Hang", "Ethi",
            "Cher", "Cans", "Ogam", "Runr", "Tglg", "Hano", "Buhd", "Tagb", "Khmr", "Mong",
            "Limb", "Tale", "Talu", "Bugi", "Lana", "Bali", "Sund", "Batk", "Lepc", "Olck",
            "Glag", "Tfng", "Hira", "Kana", "Bopo", "Hani", "Yiii", "Lisu", "Vaii", "Bamu",
            "Sylo", "Phag", "Saur", "Kali", "Rjng", "Java", "Cham", "Tavt", "Mtei"
        };

        // Unknown locales whose script is not in the list above.
        var wrongAlphas = wrongs.
                          Where(name => !alphas.Contains(LocaleIdentifier.Parse(name).Script)).
                          ToArray();
    }
Ejemplo n.º 3
0
    /// <summary>
    /// Produces rows for the .NET cultures whose language is not among <paramref name="cldrSpecifics"/>.
    /// </summary>
    /// <remarks>
    /// A culture is skipped when its name is not a valid locale identifier, when it has no region, when the
    /// region starts with a digit, or when its language already occurs in <paramref name="cldrSpecifics"/>.
    /// The language set is built immediately; the returned sequence itself is lazy.
    /// </remarks>
    /// <param name="cldrSpecifics">Locales whose languages are excluded from the result.</param>
    /// <returns>One row per remaining culture, with <c>lang</c> set to the identifier produced by <c>CldrUtils.getNetRowData</c>.</returns>
    public static IEnumerable <LangMatrixRow> fromNetCultureInfos(LocaleIdentifier[] cldrSpecifics)
    {
        // get NON cldr culture data
        var cldrs = new HashSet <string>(cldrSpecifics.Select(c => c.Language));

        return(CultureInfo.GetCultures(CultureTypes.AllCultures).
               Select(cu => {
            // TryParse replaces Parse wrapped in an empty catch: an unparsable culture name is an expected
            // case, and unrelated exceptions are no longer swallowed silently.
            if (
                !LocaleIdentifier.TryParse(cu.Name, out LocaleIdentifier lid) ||
                lid == null ||
                string.IsNullOrEmpty(lid.Region) ||
                char.IsDigit(lid.Region[0]) ||   // presumably a numeric area code rather than a country - confirm
                cldrs.Contains(lid.Language)
                )
            {
                return null;
            }

            var res = CldrUtils.getNetRowData(cu, lid.Language, lid.Region, out LocaleIdentifier locId);
            return new LangMatrixRow {
                lang = locId.ToString(),
                row = res,
            };
        }).
               Where(lt => lt != null));
    }
Ejemplo n.º 4
0
    /// <summary>
    /// Runs <c>UnicodeBlocks.checkBlockNames</c> over <c>row</c> using the script of <c>lang</c> and stores a
    /// non-null result in <paramref name="protocol"/>.
    /// </summary>
    /// <param name="protocol">Receives the check result under the key <c>lang</c>; left untouched when the result is null.</param>
    public void checkTexts(Dictionary <string, Dictionary <string, string> > protocol)
    {
        var script   = LocaleIdentifier.Parse(lang).Script;
        var findings = UnicodeBlocks.checkBlockNames(row, script);

        if (findings != null)
        {
            protocol[lang] = findings;
        }
    }
Ejemplo n.º 5
0
    /// <summary>
    /// Fills a row with the month, genitive month and day names of a .NET culture and builds the full
    /// locale identifier from the single script reported for those names.
    /// </summary>
    /// <param name="lc">Culture whose <c>DateTimeFormat</c> supplies the names.</param>
    /// <param name="Language">Language part of the resulting identifier.</param>
    /// <param name="Region">Region part of the resulting identifier.</param>
    /// <param name="locId">Receives the identifier parsed from "Language-script-Region".</param>
    /// <returns>The array of <c>count</c> entries that the names were copied into.</returns>
    static string[] getNetRowData(CultureInfo lc, string Language, string Region, out LocaleIdentifier locId)
    {
        var names      = new string[count];
        var dateFormat = lc.DateTimeFormat;

        // ToArray(target, index) is a project helper - presumably copies into 'names' starting at the index.
        dateFormat.MonthNames.Take(12).ToArray(names, monthsIdx);
        dateFormat.MonthGenitiveNames.Take(12).ToArray(names, smonthsIdx);
        dateFormat.DayNames.Take(7).ToArray(names, daysIdx);

        // Single() throws unless exactly one script is reported for the collected names.
        var scriptKeys = UnicodeBlocks.getBlockNames(names).Select(kv => kv.Key);
        var script     = scriptKeys.Single();

        locId = LocaleIdentifier.Parse($"{Language}-{script}-{Region}");
        return(names);
    }
Ejemplo n.º 6
0
    /// <summary>
    /// Creates <c>Langs.CldrLang</c> entries for a fixed list of language codes plus the language part of
    /// <c>Langs.invariantId</c>.
    /// </summary>
    /// <returns>
    /// One entry per code, filled from the code's most likely subtags; the default region is also the only
    /// region of the entry.
    /// </returns>
    public static Langs.CldrLang[] getMissingLangs()
    {
        var codes = new string[] { "ceb", "ht", "hmn", "la", "ny", "sm", "su", Langs.invariantId.Split('-')[0] };

        // Expand all codes first, then map them to entries.
        var expanded = (from code in codes
                        select LocaleIdentifier.Parse(code).MostLikelySubtags()).ToArray();

        var missing = from loc in expanded
                      select new Langs.CldrLang {
            Id            = $"{loc.Language}-{loc.Region}",
            Lang          = loc.Language,
            ScriptId      = loc.Script,
            DefaultRegion = loc.Region,
            Regions       = new string[] { loc.Region }
        };

        return(missing.ToArray());
    }
Ejemplo n.º 7
0
    /// <summary>
    /// Loads the CLDR language list, opens one <c>Locale</c> per language/script/region combination and
    /// exports exemplar characters, locale display patterns and language, script and territory names.
    /// </summary>
    /// <remarks>
    /// Every matrix is written twice: as a CSV file under <c>LangsDesignDirs.cldr</c> and as a <c>.msg</c>
    /// file (bytes from <c>Protobuf.ToBytes</c>) under <c>LangsDesignDirs.data</c>. All CSV files are
    /// written before the first <c>.msg</c> file.
    /// </remarks>
    public static void Build()
    {
        // Local helpers for the two output formats.
        void saveCsv(LangMatrix matrix, string fileName)
        {
            matrix.save(LangsDesignDirs.cldr + fileName, true);
        }

        void saveMsg(LangMatrix matrix, string relativePath)
        {
            File.WriteAllBytes(LangsDesignDirs.data + relativePath, Protobuf.ToBytes(matrixToDart(matrix)));
        }

        var cldrLangs = Json.Deserialize <Langs.CldrLang[]>(LangsDirs.dirCldrTexts);

        // One identifier per region of every language.
        var ids = cldrLangs.
                  SelectMany(c => c.Regions.Select(r => LocaleIdentifier.Parse($"{c.Lang}-{c.ScriptId}-{r}"))).
                  ToArray();
        var locs = ids.Select(id => new Locale(id)).ToArray();

        // ALPHABETS: exemplar characters per locale, with '{', '}' and spaces stripped.
        var stripper = new Regex("[{} ]");
        var alphas   = locs.
                       Select(loc => {
            var characters = loc.Find("//characters");
            var exemplars  = characters.
                             SelectChildren(XPathNodeType.Element).
                             OfType <XPathNavigator>().
                             Where(node => node.Name == "exemplarCharacters").
                             Select(node => {
                var chars    = stripper.Replace(CldrUtils.decodeUnicodeLiteral(node.Value.Normalize().Trim('[', ']')), "");
                var typeAttr = node.SelectSingleNode("./@type");

                // An element without a type attribute is filed under "root".
                var type = "root";
                if (typeAttr != null)
                {
                    type = typeAttr.Value;
                }
                return(new { type, chars });
            }).
                             ToArray();

            return(new { lang = loc.Id.ToString(), exemplars });
        }).
                       Where(a => a.exemplars.Length > 0).
                       OrderBy(a => a.lang).
                       ToArray();

        // Lazily yields one single-column row per locale for the requested exemplar type.
        IEnumerable <LangMatrixRow> rowsOfType(string wanted)
        {
            return(alphas.SelectMany(a => a.exemplars.
                                     Where(e => e.type == wanted).
                                     Select(e => new LangMatrixRow {
                lang = a.lang, columnNames = Linq.Items(e.type).ToArray(), row = Linq.Items(e.chars).ToArray()
            })));
        }

        var alphaRoot        = new LangMatrix(rowsOfType("root"), null, true);
        var alphaAuxlity     = new LangMatrix(rowsOfType("auxiliary"), null, true);
        var alphaIndex       = new LangMatrix(rowsOfType("index"), null, true);
        var alphaNumbers     = new LangMatrix(rowsOfType("numbers"), null, true);
        var alphaPunctuation = new LangMatrix(rowsOfType("punctuation"), null, true);

        // Locale display pattern and separator per locale.
        var patternRows = locs.Select(loc => {
            var pattern   = loc.FindOrDefault("//localeDisplayNames/localeDisplayPattern/localePattern").ToString();
            var separator = loc.FindOrDefault("//localeDisplayNames/localeDisplayPattern/localeSeparator").ToString();
            return(new LangMatrixRow {
                lang = loc.Id.ToString(),
                row = new string[] { pattern, separator },
                columnNames = new string[] { "pattern", "separator" },
            });
        });
        var patterns = new LangMatrix(patternRows, null, true);

        // Display names; each matrix gets its own protocol dictionary.
        var langsProtocol = new Dictionary <string, Dictionary <string, string> >();
        var langs         = new LangMatrix(locs.Select(loc => fromCldr(loc, "//localeDisplayNames/languages")), langsProtocol, true);

        var scriptsProtocol = new Dictionary <string, Dictionary <string, string> >();
        var scripts         = new LangMatrix(locs.Select(loc => fromCldr(loc, "//localeDisplayNames/scripts")), scriptsProtocol, true);

        var regionsProtocol = new Dictionary <string, Dictionary <string, string> >();
        var regions         = new LangMatrix(locs.Select(loc => fromCldr(loc, "//localeDisplayNames/territories")), regionsProtocol, true);

        // CSV output
        saveCsv(langs, "cldrNameLangs.csv");
        saveCsv(scripts, "cldrNameScripts.csv");
        saveCsv(regions, "cldrNameRegions.csv");
        saveCsv(patterns, "cldrNamePatterns.csv");
        saveCsv(alphaRoot, "alphaRoot.csv");
        saveCsv(alphaAuxlity, "alphaAuxlity.csv");
        saveCsv(alphaIndex, "alphaIndex.csv");
        saveCsv(alphaNumbers, "alphaNumbers.csv");
        saveCsv(alphaPunctuation, "alphaPunctuation.csv");

        // save to DART messages
        saveMsg(langs, @"langsDesign\cldrNameLangs.msg");
        saveMsg(scripts, @"langsDesign\cldrNameScripts.msg");
        saveMsg(regions, @"langsDesign\cldrNameRegions.msg");
        saveMsg(patterns, @"langsDesign\cldrNamePatterns.msg");
        saveMsg(alphaRoot, @"langsDesign\alphaRoot.msg");
        saveMsg(alphaAuxlity, @"langsDesign\alphaAuxlity.msg");
        saveMsg(alphaIndex, @"langsDesign\alphaIndex.msg");
        saveMsg(alphaNumbers, @"langsDesign\alphaNumbers.msg");
        saveMsg(alphaPunctuation, @"langsDesign\alphaPunctuation.msg");
    }
Ejemplo n.º 8
0
    /// <summary>
    /// Builds the region/language population table from the <c>territoryInfo</c> element of
    /// <c>supplements</c> and writes three grouped summaries as JSON under <c>LangsDesignDirs.root</c>:
    /// <c>patches\cldrRegions.json</c>, <c>patches\cldrLangsNotOfficial.json</c> and <c>patches\cldrLangs.json</c>.
    /// </summary>
    /// <param name="english">English display names; <c>Teritories</c> and <c>Langs</c> are indexed by region and language id.</param>
    static void computeLangRegionEnglish(EnglishNames english)
    {
        // lang => "script" or "script|other1,other2"
        var scripts = extractScript().ToDictionary(
            s => s.lang,
            s => s.script + (s.other == null ? "" : "|" + string.Join(",", s.other)));

        // **************************
        // ***** REGION-LANG table: <region,lang>,<population,isOfficial,script>
        var regionLang = supplements.Descendants("territoryInfo").
                         Single().
                         Elements("territory").
                         SelectMany(el => {
            // Parsed with the invariant culture, like the percentage below: the XML is machine-readable data.
            var allPop = int.Parse(el.Attribute("population").Value, CultureInfo.InvariantCulture);
            var region = el.Attribute("type").Value;
            return(el.Elements("languagePopulation").
                   Select(ee => {
                string lang = ee.Attribute("type").Value;
                return new regLang {
                    region = region,
                    lang = lang,
                    // speakers = territory population * populationPercent / 100; a missing percentage counts as 0
                    population = (int)Math.Round((double)allPop * float.Parse(ee.Attribute("populationPercent")?.Value ?? "0", CultureInfo.InvariantCulture) / 100),
                    isOfficial = ee.Attribute("officialStatus") != null,
                };
            }));
        }).
                         ToArray();

        // **************************
        // ***** population grouping

        // Groups the rows by region (byReg) or by language, sums the populations and serializes the result.
        // With noOffOnly only groups without any official population are kept.
        void groupAndSum(regLang[] data, bool byReg, bool noOffOnly = false)
        {
            // isKey selects the grouping dimension (region when byReg, otherwise language); !isKey selects
            // the other one. With sh the bare id is returned instead of the English name.
            string friendlyName(regLang d, bool isKey, bool sh = false)
            {
                var actId = byReg == isKey ? d.region : d.lang;

                if (sh)
                {
                    return(actId);
                }
                var actNames = byReg == isKey ? english.Teritories : english.Langs;

                return(actNames[actId]);
            }

            // "name:id:population" items of the non-grouping dimension, largest population first,
            // restricted to rows with the requested official flag; null when no row qualifies.
            string stringDetail(IGrouping <string, regLang> g, bool officialOnly)
            {
                var sd = string.Join(", ", g.Where(l => officialOnly == l.isOfficial && l.population > 0).
                                     OrderByDescending(l => l.population).
                                     Select(l => $"{friendlyName(l, false, false)}:{friendlyName(l, false, true)}:{l.population}"));

                return(sd == "" ? null : sd);
            }

            var by = byReg ? data.GroupBy(d => d.region) : data.GroupBy(d => d.lang);

            // Note: the projection runs for every group, including "und", before the filter below is applied.
            var res = by.Select(g => new LangOrRegion {
                name      = friendlyName(g.First(), true),
                id        = g.Key,
                offSum    = g.Where(l => l.isOfficial).Sum(l => l.population),
                nonOffSum = g.Where(l => !l.isOfficial).Sum(l => l.population),
                off       = stringDetail(g, true),
                nonOff    = stringDetail(g, false),
                likely    = byReg ? null : LocaleIdentifier.Parse(g.Key).MostLikelySubtags().ToString(),
                scripts   = byReg ? null : scripts[g.Key],
            }).
                      Where(r => r.id != "und" && (noOffOnly ? r.offSum == 0 : r.offSum + r.nonOffSum > 0)).
                      OrderByDescending(r => r.offSum + r.nonOffSum).
                      ToArray();

            var nm = byReg ? "Regions" : (noOffOnly ? "LangsNotOfficial" : "Langs");

            Json.Serialize(LangsDesignDirs.root + $"patches\\cldr{nm}.json", res);
        }

        groupAndSum(regionLang, true);
        groupAndSum(regionLang, false, true);
        groupAndSum(regionLang, false);
    }
Ejemplo n.º 9
0
    /// <summary>
    /// Reads <c>cldr\langScripts.xml</c> (under the current directory), determines the scripts of every
    /// language in <c>allLangs</c> and writes the result to <c>patches\cldrScript.json</c>.
    /// </summary>
    /// <remarks>
    /// <c>script</c> is the most likely script of the language, prefixed with
    /// "1?" when the table lists scripts but not the likely one,
    /// "2?" when the table lists no usable script for the language, and
    /// "3?" when the language is not in the table at all (unless the id itself carries a script, e.g. uz_Arab).
    /// <c>other</c> holds the remaining scripts from the table, or null when there are none.
    /// </remarks>
    /// <returns>The entries ordered by descending number of other scripts, then by language id.</returns>
    static List <LangScripts> extractScript()
    {
        var validScripts = UnicodeBlocks.ISO15924.ToHashSet();

        validScripts.Add("Jpan"); validScripts.Add("Kore"); validScripts.Add("Hans"); validScripts.Add("Hant");

        var res = new List <LangScripts>();

        foreach (var row in XElement.Load(Directory.GetCurrentDirectory() + @"\cldr\langScripts.xml").Descendants("tr"))
        {
            // Trimmed cell texts, materialized once instead of enumerating row.Elements() repeatedly.
            var cells = row.Elements().Select(el => el.Value.Trim()).ToArray();

            // last TD with value.length==4 is SCRIPT (throws when the row has no such cell)
            var script = cells.Last(s => s.Length == 4);

            // first TR with seven TD child defines LANG
            if (cells.Length == 7)
            {
                var s = new LangScripts {
                    lang = row.Descendants("a").Single().Attribute("name").Value
                };
                res.Add(s);
            }

            // obsolete script? (info in last cell). Reading the last child element avoids the unchecked
            // 'as XElement' cast on LastNode; 'cells' cannot be empty here because Last() above succeeded.
            var lastCellValue = cells[cells.Length - 1];
            var obsolete      = !validScripts.Contains(script) || lastCellValue == "N" || lastCellValue == "n/a";

            if (!obsolete)
            {
                // rows without seven cells belong to the most recently started language
                res.Last().other.Add(script);
            }
        }

        res = res.Where(l => allLangs.Contains(l.lang)).ToList();

        foreach (var s in res)
        {
            // no script found => add "2?" + likely
            var likely = LocaleIdentifier.Parse(s.lang).MostLikelySubtags();
            if (s.other.Count == 0)
            {
                s.script = "2?" + likely.Script;
                s.other  = null;
                continue;
            }

            // likely is not in found script => add "1?" + likely
            s.script = s.other.Contains(likely.Script) ? likely.Script : "1?" + likely.Script;
            s.other.Remove(likely.Script);
            if (s.other.Count == 0)
            {
                s.other = null;
            }
        }

        // add missing langs: add script from lang (e.g. Arab for uz_Arab) OR "3?" + likely
        foreach (var lang in allLangs.Except(res.Select(l => l.lang)))
        {
            res.Add(new LangScripts {
                lang = lang, script = lang.Contains('_') ? lang.Split('_')[1] : "3?" + LocaleIdentifier.Parse(lang).MostLikelySubtags().Script, other = null
            });
        }

        foreach (var lang in res)
        {
            lang.name = langTrans[lang.lang];
        }

        // sort, write and return
        res = res.OrderByDescending(s => s.other == null ? 0 : s.other.Count).ThenBy(s => s.lang).ToList();
        Json.Serialize(LangsDesignDirs.root + $"patches\\cldrScript.json", res);
        return(res);
    }