/// <summary>
/// Initializes a new instance of the <see cref="LanguageSuggestion"/> class and builds its
/// <c>Label</c> from the names of the subtags parsed out of <paramref name="languageTag"/>.
/// </summary>
/// <remarks>
/// The label has the form <c>Language[-Script][-Region]</c>. The script name is only included
/// when <c>IetfLanguageTag.IsScriptImplied</c> reports that the tag does not already imply it.
/// </remarks>
/// <param name="wsFactory">Writing system factory forwarded to the base constructor.</param>
/// <param name="languageTag">Language tag whose subtags supply the label text.</param>
/// <param name="keyboardLayout">Keyboard layout stored alongside the language tag.</param>
public LanguageSuggestion(IWritingSystemFactory wsFactory, string languageTag, string keyboardLayout)
            : base(wsFactory)
        {
            _languageTag    = languageTag;
            _keyboardLayout = keyboardLayout;
            LanguageSubtag languageSubtag;
            ScriptSubtag   scriptSubtag;
            RegionSubtag   regionSubtag;
            IEnumerable <VariantSubtag> variantSubtags;

            // The return value is deliberately not used to bail out: every subtag is checked
            // individually below, so a tag that cannot be parsed simply yields an empty label.
            IetfLanguageTag.TryGetSubtags(languageTag, out languageSubtag, out scriptSubtag, out regionSubtag,
                                          out variantSubtags);

            var s = new StringBuilder();

            // Guard the language subtag exactly like the script and region subtags; it is
            // presumably left null when the tag cannot be parsed, and dereferencing it
            // unconditionally would then throw a NullReferenceException.
            if (languageSubtag != null && !string.IsNullOrEmpty(languageSubtag.Name))
            {
                s.Append(languageSubtag.Name);
            }
            if (scriptSubtag != null && !string.IsNullOrEmpty(scriptSubtag.Name) && !IetfLanguageTag.IsScriptImplied(languageTag))
            {
                s.AppendFormat("-{0}", scriptSubtag.Name);
            }
            if (regionSubtag != null && !string.IsNullOrEmpty(regionSubtag.Name))
            {
                s.AppendFormat("-{0}", regionSubtag.Name);
            }
            Label = s.ToString();
        }
// Example #2
// 0
        /// <summary>
        /// Initializes a new instance of the <see cref="LanguageDataIndex"/> class.
        /// </summary>
        /// <remarks>
        /// The index is built in stages:
        /// 1. the IANA subtag data is initialised and the rows of LanguageIndex.txt are read;
        /// 2. every registered, non-deprecated, non-excluded language subtag gets an entry in the code index;
        /// 3. the available SLDR language tags are merged in;
        /// 4. primary countries are taken from LanguageCodes.txt;
        /// 5. a small set of names is localised and remaining primary countries are filled in;
        /// 6. languages found in several countries without a primary country are reported on the console.
        /// </remarks>
        /// <param name="sourcefiles">
        /// Text content of the data files keyed by file name. The entries "TwoToThreeCodes.txt",
        /// "ianaSubtagRegistry.txt", "LanguageIndex.txt" and "LanguageCodes.txt" are read.
        /// </param>
        public LanguageDataIndex(IDictionary <string, string> sourcefiles)
        {
            string twotothreecodes = sourcefiles["TwoToThreeCodes.txt"];
            string subtagregistry  = sourcefiles["ianaSubtagRegistry.txt"];

            StandardSubtags.InitialiseIanaSubtags(twotothreecodes, subtagregistry);

            // First read in Ethnologue data file into temporary dictionary
            var threeToTwoLetter = StandardSubtags.TwoAndThreeMap(twotothreecodes, true);

            //LanguageIndex.txt Format: LangID	CountryID	NameType	Name
            //a language appears on one row for each of its alternative names
            string languageindex = sourcefiles["LanguageIndex.txt"];
            var    entries       = new List <string>(languageindex.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));

            // Synthetic row for the "unlisted language" code; "?" stands for an unknown region.
            entries.Add("qaa\t?\tL\tUnlisted Language");

            foreach (string entry in entries.Skip(1))             //skip the header
            {
                string[] items = entry.Split('\t');
                if (items.Length != 4)
                {
                    continue;
                }
                if (items[2].StartsWith("!", StringComparison.Ordinal))                 //temporary suppression of entries while waiting for Ethnologue changes
                {
                    continue;
                }
                // excluded by !
                // all gax (ET,KE,SO) including L
                // all gaz (ET) including L
                // all hae (ET) including L

                // Prefer the two-letter code when one exists, but remember the three-letter one.
                string code            = items[0].Trim();
                string threelettercode = code;
                string twoLetterCode;
                if (threeToTwoLetter.TryGetValue(code, out twoLetterCode))
                {
                    code = twoLetterCode;
                }

                //temporary suppression of entries while waiting for Ethnologue changes (those excluded by !)
                if (ExcludedCodes.Contains(code))
                {
                    continue;
                }

                // "?" is special-cased rather than looked up in RegisteredRegions. Resolve the
                // region name once, under that guard, so that the excluded-region test further
                // down cannot attempt the lookup with "?" either.
                string       regionCode = items[1].Trim();
                string       regionName = regionCode == "?" ? "" : StandardSubtags.RegisteredRegions[regionCode].Name;
                LanguageInfo language   = GetOrCreateLanguageFromCode(code, threelettercode, regionName);

                string name     = items[3].Trim();
                string nameType = items[2].Trim();

                if (nameType == "L")
                {
                    // The "L" name always goes first, without leaving duplicates behind.
                    while (language.Names.Contains(name))
                    {
                        language.Names.Remove(name);
                    }
                    language.Names.Insert(0, name);
                }
                else
                {
                    if (items[2].Contains("P"))
                    {
                        //Skip pejorative
                    }
                    else if (ExcludedRegions.Contains(regionName))
                    {
                        //Skip alternatives for Ethiopia, as per request
                    }
                    else if (code == "gax" || code == "om")
                    {
                        //For these two "Oromo" languages, skip all related languages as per request
                    }
                    else if (!language.Names.Contains(name))
                    {
                        language.Names.Add(name);                         //intentionally not lower-casing
                    }
                }
            }

            // Then for each registered ietf language tag create a real entry and add the ethnologue data to it
            IOrderedEnumerable <LanguageSubtag> languages = StandardSubtags.RegisteredLanguages.OrderBy(lang => lang.Iso3Code);

            foreach (LanguageSubtag language in languages)
            {
                if (language.IsDeprecated || ExcludedCodes.Contains(language.Code))
                {
                    continue;
                }
                LanguageInfo langinfo = GetOrCreateLanguageFromCode(language.Code, language.Iso3Code, null);
                langinfo.DesiredName     = language.Name.Replace("'", "’");
                langinfo.IsMacroLanguage = language.IsMacroLanguage;

                // Languages found in an excluded region keep a single name only.
                bool singlename = false;
                foreach (string country in langinfo.Countries)
                {
                    if (ExcludedRegions.Contains(country))
                    {
                        singlename = true;
                    }
                }

                foreach (string name in language.Names)
                {
                    string langname = name.Replace("'", "’");
                    if (!langinfo.Names.Contains(langname))
                    {
                        if (singlename && langinfo.Names.Count == 1)
                        {
                            // leave single ethnologue names
                            break;
                        }
                        else
                        {
                            langinfo.Names.Add(langname);
                        }
                    }
                    if (singlename)
                    {
                        break;
                    }
                }
                _codeToLanguageIndex.Add(language.Code, langinfo);
            }

            // Merge in the SLDR tags, grouped by their language part.
            IEnumerable <IGrouping <string, string> > languageGroups = Sldr.LanguageTags.Where(info => info.IsAvailable && IetfLanguageTag.IsValid(info.LanguageTag))
                                                                       .Select(info => IetfLanguageTag.Canonicalize(info.LanguageTag))
                                                                       .GroupBy(IetfLanguageTag.GetLanguagePart);

            foreach (IGrouping <string, string> languageGroup in languageGroups)
            {
                string[] langTags = languageGroup.ToArray();
                if (langTags.Length == 1)
                {
                    // A single tag for the language: re-key the existing entry under the full tag.
                    string       langTag = langTags[0];
                    LanguageInfo language;
                    if (langTag != languageGroup.Key && _codeToLanguageIndex.TryGetValue(languageGroup.Key, out language))
                    {
                        _codeToLanguageIndex.Remove(languageGroup.Key);
                        language.LanguageTag          = langTag;
                        _codeToLanguageIndex[langTag] = language;
                    }
                }
                else
                {
                    // Several tags for the language: every tag other than the bare language
                    // code gets an entry of its own.
                    foreach (string langTag in langTags)
                    {
                        LanguageSubtag languageSubtag;
                        ScriptSubtag   scriptSubtag;
                        RegionSubtag   regionSubtag;
                        IEnumerable <VariantSubtag> variantSubtags;
                        if (IetfLanguageTag.TryGetSubtags(langTag, out languageSubtag, out scriptSubtag, out regionSubtag, out variantSubtags))
                        {
                            if (langTag == languageSubtag)
                            {
                                continue;
                            }

                            LanguageInfo language      = GetOrCreateLanguageFromCode(langTag, langTag, regionSubtag == null ? "" : regionSubtag.Name);                        // changed to default to "" 2017-04-24
                            bool         displayScript = scriptSubtag != null && !IetfLanguageTag.IsScriptImplied(langTag);
                            LanguageInfo otherLanguage;
                            if (langTag != languageSubtag && !displayScript && _codeToLanguageIndex.TryGetValue(languageSubtag, out otherLanguage) && language.Countries.SetEquals(otherLanguage.Countries))
                            {
                                language.Names.AddRange(otherLanguage.Names);
                            }
                            else
                            {
                                string name = displayScript ? string.Format("{0} ({1})", languageSubtag.Name, scriptSubtag.Name) : languageSubtag.Name;
                                if (!language.Names.Contains(name))
                                {
                                    language.Names.Add(name);                                     //intentionally not lower-casing
                                }
                            }
                            LanguageInfo keylanguage;
                            if (_codeToLanguageIndex.TryGetValue(languageGroup.Key, out keylanguage))
                            {
                                language.IsMacroLanguage = keylanguage.IsMacroLanguage;
                            }
                            _codeToLanguageIndex.Add(langTag, language);
                        }
                    }
                }
            }

            // Primary countries come from LanguageCodes.txt (first column: code, second column: country).
            string languagecodes = sourcefiles["LanguageCodes.txt"];
            var    codeentries   = new List <string>(languagecodes.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));

            foreach (var languageCode in codeentries)
            {
                var data = languageCode.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
                if (data.Length < 2)
                {
                    continue;
                }
                var    langCode = data[0];
                string twoLetterCode;
                if (threeToTwoLetter.TryGetValue(langCode, out twoLetterCode))
                {
                    langCode = twoLetterCode;
                }
                if (langCode == "fuv")
                {
                    langCode = "fuv-Arab";                      // special case because the script has been added to this language code
                }
                // which is probably something to do with the SLDR
                var          countryCode = data[1];
                LanguageInfo lang;
                if (_codeToLanguageIndex.TryGetValue(langCode, out lang))
                {
                    lang.PrimaryCountry = StandardSubtags.RegisteredRegions[countryCode].Name;
                }
            }

            // localise some language names
            foreach (LanguageInfo languageInfo in _codeToLanguageIndex.Values)
            {
                if (languageInfo.Names.Count == 0)
                {
                    continue;                     // this language is suppressed
                }
                //Why just this small set? Only out of convenience. Ideally we'd have a db of all languages as they write it in their literature.
                string localName = null;
                switch (languageInfo.Names[0])
                {
                case "French":
                    localName = "français";
                    break;

                case "Spanish":
                    localName = "español";
                    break;

                case "Chinese":
                    localName = "中文";
                    break;

                case "Hindi":
                    localName = "हिन्दी";
                    break;

                case "Bengali":
                    localName = "বাংলা";
                    break;

                case "Telugu":
                    localName = "తెలుగు";
                    break;

                case "Tamil":
                    localName = "தமிழ்";
                    break;

                case "Urdu":
                    localName = "اُردُو";
                    break;

                case "Arabic":
                    localName = "العربية/عربي";
                    break;

                case "Thai":
                    localName = "ภาษาไทย";
                    break;

                case "Indonesian":
                    localName = "Bahasa Indonesia";
                    break;
                }
                if (!string.IsNullOrEmpty(localName))
                {
                    // Move (or add) the local name to the front; Remove is a no-op when it is absent.
                    languageInfo.Names.Remove(localName);
                    languageInfo.Names.Insert(0, localName);
                    languageInfo.DesiredName = localName;
                }

                switch (languageInfo.ThreeLetterTag)
                {
                case "itd":                         // 2 temporary special cases because the LanguageCodes.txt files needs to be updated with LanguageIndex.txt
                    languageInfo.PrimaryCountry = "Indonesia";
                    break;

                case "xak":
                    languageInfo.PrimaryCountry = "Venezuela";
                    break;

                default:
                    // Also set the PrimaryCountry if there is only one country
                    if (String.IsNullOrEmpty(languageInfo.PrimaryCountry) && languageInfo.Countries.Count == 1)
                    {
                        languageInfo.PrimaryCountry = languageInfo.Countries.First();
                    }
                    break;
                }
            }

            // check if any languages are found in multiple countries but do not have a primary country
            // there is a test for this in LanguageLookupTests.llExpectedLanguagesHaveUniquePrimaryCountries
            foreach (LanguageInfo lang in _codeToLanguageIndex.Values)
            {
                if (String.IsNullOrEmpty(lang.PrimaryCountry) && lang.Countries.Count > 1)
                {
                    Console.WriteLine("Language {0}({1}) has no primary country but is found in multiple countries", lang.DesiredName, lang.LanguageTag);
                }
            }
        }
        /// <summary>
        /// Initializes a new instance of the <see cref="LanguageLookup"/> class.
        /// </summary>
        /// <remarks>
        /// Reads the embedded TwoToThreeCodes and LanguageIndex resources, merges in the available
        /// SLDR language tags, registers every language under each of its names and finally puts
        /// a local name first for a small set of well-known languages.
        /// </remarks>
        public LanguageLookup()
        {
            // Map from three-letter code (second column) to two-letter code (first column).
            var threeToTwoLetter = new Dictionary <string, string>();

            foreach (string line in LanguageRegistryResources.TwoToThreeCodes.Replace("\r\n", "\n").Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] items = line.Split('\t');
                if (items.Length < 2)
                {
                    continue;                     // malformed line: skip it, as malformed LanguageIndex rows are skipped below
                }
                threeToTwoLetter.Add(items[1].Trim(), items[0].Trim());
            }

            //LanguageIndex.txt Format: LangID	CountryID	NameType	Name
            //a language appears on one row for each of its alternative names
            var entries = new List <string>(LanguageRegistryResources.LanguageIndex.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));

            // Synthetic row for the "unlisted language" code; "?" stands for an unknown region.
            entries.Add("qaa\t?\tL\tUnlisted Language");
            foreach (string entry in entries.Skip(1))             //skip the header
            {
                string[] items = entry.Split('\t');
                if (items.Length != 4)
                {
                    continue;
                }
                if (items[2].Contains('!'))                //temporary suppression of entries while waiting for Ethnologue changes
                {
                    continue;
                }

                // Prefer the two-letter code when one exists.
                string code = items[0].Trim();
                string twoLetterCode;
                if (threeToTwoLetter.TryGetValue(code, out twoLetterCode))
                {
                    code = twoLetterCode;
                }

                string       regionCode = items[1].Trim();
                LanguageInfo language   = GetOrCreateLanguageFromCode(code, regionCode == "?" ? "?" : StandardSubtags.RegisteredRegions[regionCode].Name);

                string name = items[3].Trim();

                // The comparisons below use the trimmed/normalised values so that stray
                // whitespace in a column cannot change how a row is classified.
                if (items[2].Trim() == "L")
                {
                    // The "L" name always goes first, without leaving duplicates behind.
                    while (language.Names.Contains(name))
                    {
                        language.Names.Remove(name);
                    }
                    language.Names.Insert(0, name);
                }
                else
                {
                    if (items[2].Contains("P"))
                    {
                        //Skip pejorative
                    }
                    else if (regionCode == "ET")
                    {
                        //Skip alternatives for Ethiopia, as per request
                    }
                    else if (code == "gax" || code == "om")
                    {
                        //For these two "Oromo" languages, skip all related languages as per request
                        // (tested on the normalised code: the raw column is replaced by its
                        // two-letter form above whenever one exists, so "om" can only be seen here)
                    }
                    else if (!language.Names.Contains(name))
                    {
                        language.Names.Add(name);                         //intentionally not lower-casing
                    }
                }
            }

            // Merge in the SLDR tags, grouped by their language part.
            IEnumerable <IGrouping <string, string> > languageGroups = Sldr.LanguageTags.Where(info => info.IsAvailable && IetfLanguageTag.IsValid(info.LanguageTag))
                                                                       .Select(info => IetfLanguageTag.Canonicalize(info.LanguageTag))
                                                                       .GroupBy(IetfLanguageTag.GetLanguagePart);

            foreach (IGrouping <string, string> languageGroup in languageGroups)
            {
                string[] langTags = languageGroup.ToArray();
                if (langTags.Length == 1)
                {
                    // A single tag for the language: re-key the existing entry under the full tag.
                    string       langTag = langTags[0];
                    LanguageInfo language;
                    if (langTag != languageGroup.Key && _codeToLanguageIndex.TryGetValue(languageGroup.Key, out language))
                    {
                        _codeToLanguageIndex.Remove(languageGroup.Key);
                        language.LanguageTag          = langTag;
                        _codeToLanguageIndex[langTag] = language;
                    }
                }
                else
                {
                    // Several tags for the language: handle every tag other than the bare language code.
                    foreach (string langTag in langTags)
                    {
                        LanguageSubtag languageSubtag;
                        ScriptSubtag   scriptSubtag;
                        RegionSubtag   regionSubtag;
                        IEnumerable <VariantSubtag> variantSubtags;
                        if (IetfLanguageTag.TryGetSubtags(langTag, out languageSubtag, out scriptSubtag, out regionSubtag, out variantSubtags))
                        {
                            if (langTag == languageSubtag)
                            {
                                continue;
                            }

                            LanguageInfo language      = GetOrCreateLanguageFromCode(langTag, regionSubtag == null ? "?" : regionSubtag.Name);
                            bool         displayScript = scriptSubtag != null && !IetfLanguageTag.IsScriptImplied(langTag);
                            LanguageInfo otherLanguage;
                            if (langTag != languageSubtag && !displayScript && _codeToLanguageIndex.TryGetValue(languageSubtag, out otherLanguage) && language.Countries.SetEquals(otherLanguage.Countries))
                            {
                                language.Names.AddRange(otherLanguage.Names);
                            }
                            else
                            {
                                string name = displayScript ? string.Format("{0} ({1})", languageSubtag.Name, scriptSubtag.Name) : languageSubtag.Name;
                                if (!language.Names.Contains(name))
                                {
                                    language.Names.Add(name);                                     //intentionally not lower-casing
                                }
                            }
                        }
                    }
                }
            }

            foreach (LanguageInfo languageInfo in _codeToLanguageIndex.Values)
            {
                // Register the language under every one of its names.
                foreach (string name in languageInfo.Names)
                {
                    GetOrCreateListFromName(name).Add(languageInfo);
                }

                if (languageInfo.Names.Count == 0)
                {
                    continue;                     // this language is suppressed
                }
                //Why just this small set? Only out of convenience. Ideally we'd have a db of all languages as they write it in their literature.
                string localName = null;
                switch (languageInfo.Names[0])
                {
                case "French":
                    localName = "français";
                    break;

                case "Spanish":
                    localName = "español";
                    break;

                case "Chinese":
                    localName = "中文";
                    break;

                case "Hindi":
                    localName = "हिन्दी";
                    break;

                case "Bengali":
                    localName = "বাংলা";
                    break;

                case "Telugu":
                    localName = "తెలుగు";
                    break;

                case "Tamil":
                    localName = "தமிழ்";
                    break;

                case "Urdu":
                    localName = "اُردُو";
                    break;

                case "Arabic":
                    localName = "العربية/عربي";
                    break;

                case "Thai":
                    localName = "ภาษาไทย";
                    break;

                case "Indonesian":
                    localName = "Bahasa Indonesia";
                    break;
                }
                if (!string.IsNullOrEmpty(localName))
                {
                    // If the local name was not already one of the names it has not been
                    // registered by the loop above, so register it now before moving it first.
                    if (!languageInfo.Names.Remove(localName))
                    {
                        GetOrCreateListFromName(localName).Add(languageInfo);
                    }
                    languageInfo.Names.Insert(0, localName);
                }
            }
        }
 /// <summary>
 /// Verifies that <c>IetfLanguageTag.IsScriptImplied</c> returns
 /// <paramref name="expectedResult"/> for the given <paramref name="tag"/>.
 /// </summary>
 /// <param name="tag">Language tag handed to <c>IsScriptImplied</c>.</param>
 /// <param name="expectedResult">Value the call is expected to return.</param>
 public void IsScriptImplied_ReturnsExpectedResults(string tag, bool expectedResult)
 {
     bool actualResult = IetfLanguageTag.IsScriptImplied(tag);

     Assert.That(actualResult, Is.EqualTo(expectedResult));
 }