Пример #1
0
 public void Canonicalize_ImplicitScript_SuppressesScript()
 {
     Assert.That(IetfLanguageTag.Canonicalize("en-Latn-US"), Is.EqualTo("en-US"));
     Assert.That(IetfLanguageTag.Canonicalize("zh-hans-Cn-x-stuff"), Is.EqualTo("zh-CN-x-stuff"));
     Assert.That(IetfLanguageTag.Canonicalize("zH-hans-Cn"), Is.EqualTo("zh-CN"));
     Assert.That(IetfLanguageTag.Canonicalize("zH-Hant-tW-x-stuff"), Is.EqualTo("zh-TW-x-stuff"));
     Assert.That(IetfLanguageTag.Canonicalize("Zh-hant-Tw"), Is.EqualTo("zh-TW"));
     Assert.That(IetfLanguageTag.Canonicalize("oro-Latn"), Is.EqualTo("oro"));
 }
Пример #2
0
        private bool TryGetNewLangTag(string oldTag, out string newTag)
        {
            if (m_tagMap.TryGetValue(oldTag, out newTag))
            {
                return(!newTag.Equals(oldTag, StringComparison.InvariantCultureIgnoreCase));
            }

            var cleaner = new IetfLanguageTagCleaner(oldTag);

            cleaner.Clean();
            newTag = cleaner.GetCompleteTag();
            while (m_tagMap.Values.Contains(newTag, StringComparer.InvariantCultureIgnoreCase))
            {
                // We can't use this tag because it would conflict with what we are mapping something else to.
                cleaner = new IetfLanguageTagCleaner(cleaner.Language, cleaner.Script, cleaner.Region, cleaner.Variant,
                                                     WritingSystemIdMigrator.GetNextDuplPart(cleaner.PrivateUse));
                newTag = cleaner.GetCompleteTag();
            }

            newTag = IetfLanguageTag.Canonicalize(newTag);

            m_tagMap[oldTag] = newTag;
            return(!newTag.Equals(oldTag, StringComparison.InvariantCultureIgnoreCase));
        }
Пример #3
0
        /// <summary>
        /// API to query the SLDR for an LDML file and save it locally in the SLDR cache and specified directories
        /// </summary>
        /// <param name="destinationPath">Destination path to save the requested LDML file</param>
        /// <param name="languageTag">Current IETF language tag</param>
        /// <param name="topLevelElements">List of top level element names to request. SLDR will always publish identity, so it doesn't need to be requested.
        /// If null, the entire LDML file will be requested.</param>
        /// <param name="filename">Saved filename</param>
        /// <returns>Enum status SldrStatus if file could be retrieved and the source</returns>
        public static SldrStatus GetLdmlFile(string destinationPath, string languageTag, IEnumerable <string> topLevelElements, out string filename)
        {
            CheckInitialized();

            if (String.IsNullOrEmpty(destinationPath))
            {
                throw new ArgumentException("destinationPath");
            }
            if (!Directory.Exists(destinationPath))
            {
                throw new DirectoryNotFoundException("destinationPath");
            }
            if (String.IsNullOrEmpty(languageTag) || (!IetfLanguageTag.IsValid(languageTag)))
            {
                throw new ArgumentException("ietfLanguageTag");
            }
            if (topLevelElements == null)
            {
                throw new ArgumentNullException("topLevelElements");
            }

            string sldrLanguageTag = IetfLanguageTag.Canonicalize(languageTag);
            SldrLanguageTagInfo langTagInfo;

            if (LanguageTags.TryGet(sldrLanguageTag, out langTagInfo))
            {
                sldrLanguageTag = langTagInfo.SldrLanguageTag;
            }
            string[] topLevelElementsArray = topLevelElements.ToArray();

            using (_sldrCacheMutex.Lock())
            {
                var status = SldrStatus.NotFound;
                CreateSldrCacheDirectory();
                string sldrCacheFilePath;
                bool   redirected;
                do
                {
                    string revid, uid = "", tempString;
                    if (destinationPath == SldrCachePath)
                    {
                        filename = string.Format("{0}.{1}", sldrLanguageTag, LdmlExtension);
                    }
                    else
                    {
                        filename = string.Format("{0}.{1}", languageTag, LdmlExtension);
                        // Check if LDML file already exists in destination and read revid and uid
                        if (!ReadSilIdentity(Path.Combine(destinationPath, filename), out tempString, out uid))
                        {
                            uid = DefaultUserId;
                        }
                    }

                    // If languageTag contains fonipa, don't bother trying to access the SLDR
                    if (sldrLanguageTag.Contains(WellKnownSubtags.IpaVariant) || sldrLanguageTag.Contains(WellKnownSubtags.AudioScript))
                    {
                        return(SldrStatus.NotFound);
                    }

                    sldrCacheFilePath = Path.Combine(SldrCachePath, !string.IsNullOrEmpty(uid) && uid != DefaultUserId ? string.Format("{0}-{1}.{2}", sldrLanguageTag, uid, LdmlExtension)
                                                : string.Format("{0}.{1}", sldrLanguageTag, LdmlExtension));
                    // Read revid from cache file
                    ReadSilIdentity(sldrCacheFilePath, out revid, out tempString);

                    // Concatenate parameters for url string
                    string requestedElements = string.Empty;
                    if (topLevelElementsArray.Length > 0)
                    {
                        requestedElements = string.Format("&inc[]={0}", string.Join("&inc[]=", topLevelElementsArray));
                    }
                    string requestedUserId = !string.IsNullOrEmpty(uid) ? string.Format("&uid={0}", uid) : string.Empty;
                    string requestedRevid  = !string.IsNullOrEmpty(revid) ? string.Format("&revid={0}", revid) : string.Empty;
                    string url             = string.Format("{0}{1}?ext={2}&flatten=1{3}{4}{5}",
                                                           SldrRepository, sldrLanguageTag, LdmlExtension,
                                                           requestedElements, requestedUserId, requestedRevid);

                    string tempFilePath = sldrCacheFilePath + "." + TmpExtension;

                    // Using WebRequest instead of WebClient so we have access to disable AllowAutoRedirect
                    var webRequest = (HttpWebRequest)WebRequest.Create(Uri.EscapeUriString(url));
                    webRequest.AllowAutoRedirect = false;
                    webRequest.UserAgent         = UserAgent;
                    webRequest.Timeout           = 10000;

                    try
                    {
                        if (_offlineMode)
                        {
                            throw new WebException("Test mode: SLDR offline so accessing cache", WebExceptionStatus.ConnectFailure);
                        }

                        // Check the response header to see if the requested LDML file got redirected
                        using (var webResponse = (HttpWebResponse)webRequest.GetResponse())
                        {
                            if (webResponse.StatusCode == HttpStatusCode.NotModified)
                            {
                                // Report status that file is the most current from SLDR
                                status     = SldrStatus.FromSldr;
                                redirected = false;
                            }
                            else if (webResponse.StatusCode == HttpStatusCode.MovedPermanently)
                            {
                                // Extract ietfLanguageTag from the response header
                                var parsedresponse = HttpUtilityFromMono.ParseQueryString(webResponse.Headers["Location"]);
                                sldrLanguageTag = parsedresponse.Get("ws_id").Split('?')[0];
                                redirected      = true;
                            }
                            else
                            {
                                // Download the LDML file to a temp file in case the transfer gets interrupted
                                using (Stream responseStream = webResponse.GetResponseStream())
                                    using (var fs = new FileStream(tempFilePath, FileMode.OpenOrCreate, FileAccess.Write))
                                    {
                                        var buff = new byte[102400];
                                        int c;
                                        while ((c = responseStream.Read(buff, 0, buff.Length)) > 0)
                                        {
                                            fs.Write(buff, 0, c);
                                            fs.Flush();
                                        }
                                    }

                                status            = SldrStatus.FromSldr;
                                sldrCacheFilePath = MoveTmpToCache(tempFilePath, uid);
                                redirected        = false;
                            }
                        }
                    }
                    catch (WebException we)
                    {
                        // Return from 404 error
                        var errorResponse = (HttpWebResponse)we.Response;
                        if ((we.Status == WebExceptionStatus.ProtocolError) && (errorResponse.StatusCode == HttpStatusCode.NotFound))
                        {
                            return(SldrStatus.NotFound);
                        }

                        string sldrCacheFilename;
                        // Download failed so check SLDR cache
                        if (!string.IsNullOrEmpty(uid) && (uid != DefaultUserId))
                        {
                            sldrCacheFilename = string.Format("{0}-{1}.{2}", sldrLanguageTag, uid, LdmlExtension);
                        }
                        else
                        {
                            sldrCacheFilename = string.Format("{0}.{1}", sldrLanguageTag, LdmlExtension);
                        }
                        sldrCacheFilePath = Path.Combine(SldrCachePath, sldrCacheFilename);
                        if (File.Exists(sldrCacheFilePath))
                        {
                            status = SldrStatus.FromCache;
                        }
                        else
                        {
                            return(SldrStatus.UnableToConnectToSldr);
                        }
                        redirected = false;
                    }
                    finally
                    {
                        if (File.Exists(tempFilePath))
                        {
                            File.Delete(tempFilePath);
                        }
                    }
                } while (redirected);

                if (destinationPath != SldrCachePath)
                {
                    // Copy from Cache to destination (w/o uid in filename), overwriting whatever used to be there
                    File.Copy(sldrCacheFilePath, Path.Combine(destinationPath, filename), true);
                }

                return(status);
            }
        }
        public override void Migrate(string sourceFilePath, string destinationFilePath)
        {
            string sourceFileName = Path.GetFileName(sourceFilePath);

            var writingSystemDefinitionV1 = new WritingSystemDefinitionV1();

            new LdmlAdaptorV1().Read(sourceFilePath, writingSystemDefinitionV1);

            string abbreviation = writingSystemDefinitionV1.Abbreviation;
            float  defaultFontSize = writingSystemDefinitionV1.DefaultFontSize;
            string keyboard = writingSystemDefinitionV1.Keyboard;
            string spellCheckingId = writingSystemDefinitionV1.SpellCheckingId;
            string defaultFontName = writingSystemDefinitionV1.DefaultFontName;
            string languageName = writingSystemDefinitionV1.LanguageName.IsOneOf("Unknown Language", "Language Not Listed") ? string.Empty : writingSystemDefinitionV1.LanguageName;
            string variant, privateUse;

            IetfLanguageTag.SplitVariantAndPrivateUse(writingSystemDefinitionV1.Variant, out variant, out privateUse);
            var langTagCleaner = new IetfLanguageTagCleaner(writingSystemDefinitionV1.Language, writingSystemDefinitionV1.Script, writingSystemDefinitionV1.Region,
                                                            variant, privateUse);

            langTagCleaner.Clean();
            string                    langTag           = IetfLanguageTag.Canonicalize(langTagCleaner.GetCompleteTag());
            List <string>             knownKeyboards    = writingSystemDefinitionV1.KnownKeyboards.Select(k => string.IsNullOrEmpty(k.Locale) ? k.Layout : string.Format("{0}_{1}", k.Locale, k.Layout)).ToList();
            bool                      isGraphiteEnabled = false;
            string                    legacyMapping     = string.Empty;
            string                    scriptName        = string.Empty;
            string                    regionName        = string.Empty;
            string                    variantName       = string.Empty;
            SystemCollationDefinition scd = null;

            // Create system collation definition if applicable
            if ((writingSystemDefinitionV1.SortUsing == WritingSystemDefinitionV1.SortRulesType.OtherLanguage) && (!string.IsNullOrEmpty(writingSystemDefinitionV1.SortRules)))
            {
                scd = new SystemCollationDefinition {
                    LanguageTag = writingSystemDefinitionV1.SortRules
                }
            }
            ;

            // Migrate fields from legacy fw namespace, and then remove fw namespace
            XElement ldmlElem = XElement.Load(sourceFilePath);
            XElement fwElem   = ldmlElem.Elements("special").FirstOrDefault(e => !string.IsNullOrEmpty((string)e.Attribute(XNamespace.Xmlns + "fw")));

            if (fwElem != null)
            {
                XElement graphiteEnabledElem = fwElem.Element(FW + "graphiteEnabled");
                if (graphiteEnabledElem != null)
                {
                    if (!bool.TryParse((string)graphiteEnabledElem.Attribute("value"), out isGraphiteEnabled))
                    {
                        isGraphiteEnabled = false;
                    }
                }

                // LegacyMapping
                XElement legacyMappingElem = fwElem.Element(FW + "legacyMapping");
                if (legacyMappingElem != null)
                {
                    legacyMapping = (string)legacyMappingElem.Attribute("value");
                }

                // ScriptName
                XElement scriptNameElem = fwElem.Element(FW + "scriptName");
                if (scriptNameElem != null)
                {
                    scriptName = (string)scriptNameElem.Attribute("value");
                }

                // RegionName
                XElement regionNameElem = fwElem.Element(FW + "regionName");
                if (regionNameElem != null)
                {
                    regionName = (string)regionNameElem.Attribute("value");
                }

                // VariantName
                XElement variantNameElem = fwElem.Element(FW + "variantName");
                if (variantNameElem != null)
                {
                    variantName = (string)variantNameElem.Attribute("value");
                }
            }

            // Record the details for use in PostMigrate where we change the file name to match the ieft language tag where we can.
            var migrationInfo = new LdmlMigrationInfo(sourceFileName)
            {
                LanguageTagBeforeMigration = writingSystemDefinitionV1.Bcp47Tag,
                LanguageTagAfterMigration  = langTag,
                RemovedPropertiesSetter    = ws =>
                {
                    if (!string.IsNullOrEmpty(abbreviation))
                    {
                        ws.Abbreviation = abbreviation;
                    }
                    if (defaultFontSize != 0)
                    {
                        ws.DefaultFontSize = defaultFontSize;
                    }
                    if (!string.IsNullOrEmpty(keyboard))
                    {
                        ws.Keyboard = keyboard;
                    }
                    if (!string.IsNullOrEmpty(spellCheckingId))
                    {
                        ws.SpellCheckingId = spellCheckingId;
                    }
                    if (!string.IsNullOrEmpty(defaultFontName))
                    {
                        ws.DefaultFont = ws.Fonts[defaultFontName];
                    }
                    if (!string.IsNullOrEmpty(languageName))
                    {
                        ws.Language = new LanguageSubtag(ws.Language, languageName);
                    }
                    ws.IsGraphiteEnabled = isGraphiteEnabled;
                    if (!string.IsNullOrEmpty(legacyMapping))
                    {
                        ws.LegacyMapping = legacyMapping;
                    }
                    if (!string.IsNullOrEmpty(scriptName) && ws.Script != null && ws.Script.IsPrivateUse)
                    {
                        ws.Script = new ScriptSubtag(ws.Script, scriptName);
                    }
                    if (!string.IsNullOrEmpty(regionName) && ws.Region != null && ws.Region.IsPrivateUse)
                    {
                        ws.Region = new RegionSubtag(ws.Region, regionName);
                    }
                    if (scd != null)
                    {
                        ws.DefaultCollation = scd;
                    }
                    foreach (string keyboardId in knownKeyboards)
                    {
                        IKeyboardDefinition kd;
                        if (!Keyboard.Controller.TryGetKeyboard(keyboardId, out kd))
                        {
                            kd = Keyboard.Controller.CreateKeyboard(keyboardId, KeyboardFormat.Unknown, Enumerable.Empty <string>());
                        }
                        ws.KnownKeyboards.Add(kd);
                    }
                }
            };

            _migrationInfo.Add(migrationInfo);

            // Store things that stay in ldml but are being moved: WindowsLcid, variantName, font, known keyboards, collations, font features, character sets

            // misc properties
            var staging = new Staging
            {
                WindowsLcid     = writingSystemDefinitionV1.WindowsLcid,
                DefaultFontName = writingSystemDefinitionV1.DefaultFontName,
                SortUsing       = writingSystemDefinitionV1.SortUsing,
                SortRules       = writingSystemDefinitionV1.SortRules,
            };

            // Determine if variantName is non-common private use before preserving it
            if (!string.IsNullOrEmpty(variantName))
            {
                int index = IetfLanguageTag.GetIndexOfFirstNonCommonPrivateUseVariant(IetfLanguageTag.GetVariantSubtags(migrationInfo.LanguageTagAfterMigration));
                if (index > -1)
                {
                    staging.VariantName = variantName;
                }
            }

            if (fwElem != null)
            {
                // DefaultFontFeatures
                XElement fontFeatsElem = fwElem.Element(FW + "defaultFontFeatures");
                if (fontFeatsElem != null && !string.IsNullOrEmpty(staging.DefaultFontName))
                {
                    staging.DefaultFontFeatures = (string)fontFeatsElem.Attribute("value");
                }

                //MatchedPairs, PunctuationPatterns, QuotationMarks deprecated

                // Valid Chars
                XElement validCharsElem = fwElem.Element(FW + "validChars");
                if (validCharsElem != null)
                {
                    try
                    {
                        var fwValidCharsElem = XElement.Parse((string)validCharsElem.Attribute("value"));
                        AddCharacterSet(fwValidCharsElem, staging, "WordForming", "main");
                        AddCharacterSet(fwValidCharsElem, staging, "Numeric", "numeric");
                        AddCharacterSet(fwValidCharsElem, staging, "Other", "punctuation");
                    }
                    catch (XmlException)
                    {
                        ParseLegacyWordformingCharOverridesFile(staging);
                    }
                }
            }

            _staging[sourceFileName] = staging;
        }
        public override void PostMigrate(string sourcePath, string destinationPath)
        {
            EnsureIeftLanguageTagsUnique(_migrationInfo);

            // Write them back, with their new file name.
            foreach (LdmlMigrationInfo migrationInfo in _migrationInfo)
            {
                Staging staging             = _staging[migrationInfo.FileName];
                string  sourceFilePath      = Path.Combine(sourcePath, migrationInfo.FileName);
                string  destinationFilePath = Path.Combine(destinationPath, migrationInfo.LanguageTagAfterMigration + ".ldml");

                XElement ldmlElem = XElement.Load(sourceFilePath);
                // Remove legacy palaso namespace from sourceFilePath
                ldmlElem.Elements("special").Where(e => !string.IsNullOrEmpty((string)e.Attribute(XNamespace.Xmlns + "palaso"))).Remove();
                ldmlElem.Elements("special").Where(e => !string.IsNullOrEmpty((string)e.Attribute(XNamespace.Xmlns + "palaso2"))).Remove();
                ldmlElem.Elements("special").Where(e => !string.IsNullOrEmpty((string)e.Attribute(XNamespace.Xmlns + "fw"))).Remove();

                // Remove collations to repopulate later
                ldmlElem.Elements("collations").Remove();

                // Write out the elements.
                XElement identityElem = ldmlElem.Element("identity");
                WriteIdentityElement(identityElem, staging, IetfLanguageTag.Canonicalize(migrationInfo.LanguageTagAfterMigration));

                var layoutElement = ldmlElem.Element("layout");
                WriteLayoutElement(layoutElement);

                if (staging.CharacterSets.ContainsKey("main") || staging.CharacterSets.ContainsKey("punctuation"))
                {
                    XElement charactersElem = ldmlElem.GetOrCreateElement("characters");
                    WriteCharactersElement(charactersElem, staging);
                }

                if (staging.CharacterSets.ContainsKey("numeric"))
                {
                    XElement numbersElem = ldmlElem.GetOrCreateElement("numbers");
                    WriteNumbersElement(numbersElem, staging);
                }

                if (staging.SortUsing != WritingSystemDefinitionV1.SortRulesType.OtherLanguage)
                {
                    XElement collationsElem = ldmlElem.GetOrCreateElement("collations");
                    WriteCollationsElement(collationsElem, staging);
                }

                // If needed, create top level special for external resources
                if (!string.IsNullOrEmpty(staging.DefaultFontName))
                {
                    // Create special element
                    XElement specialElem = CreateSpecialElement(ldmlElem);
                    WriteTopLevelSpecialElements(specialElem, staging);
                }

                var writerSettings = CanonicalXmlSettings.CreateXmlWriterSettings();
                writerSettings.NewLineOnAttributes = false;
                using (var writer = XmlWriter.Create(destinationFilePath, writerSettings))
                    ldmlElem.WriteTo(writer);

                if (migrationInfo.LanguageTagBeforeMigration != migrationInfo.LanguageTagAfterMigration)
                {
                    _auditLog.LogChange(migrationInfo.LanguageTagBeforeMigration, migrationInfo.LanguageTagAfterMigration);
                }
            }
            if (_migrationHandler != null)
            {
                _migrationHandler(ToVersion, _migrationInfo);
            }
        }
Пример #6
0
 public void Canonicalize_NonImplicitScript_DoesNotSuppressScript()
 {
     Assert.That(IetfLanguageTag.Canonicalize("en-Cyrl-US"), Is.EqualTo("en-Cyrl-US"));
     Assert.That(IetfLanguageTag.Canonicalize("sr-Latn"), Is.EqualTo("sr-Latn"));
 }
Пример #7
0
 public void Canonicalize_NonStandardCapitalization_StandardCapitalization()
 {
     Assert.That(IetfLanguageTag.Canonicalize("zH-latn-cn-FonIpa-X-Etic"), Is.EqualTo("zh-Latn-CN-fonipa-x-etic"));
 }
        /// <summary>
        /// Initializes a new instance of the <see cref="LanguageLookup"/> class.
        /// </summary>
        public LanguageLookup()
        {
            var threeToTwoLetter = new Dictionary <string, string>();

            foreach (string line in LanguageRegistryResources.TwoToThreeCodes.Replace("\r\n", "\n").Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries))
            {
                string[] items = line.Split('\t');
                threeToTwoLetter.Add(items[1].Trim(), items[0].Trim());
            }

            //LanguageIndex.txt Format: LangID	CountryID	NameType	Name
            //a language appears on one row for each of its alternative langauges
            var entries = new List <string>(LanguageRegistryResources.LanguageIndex.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));

            entries.Add("qaa\t?\tL\tUnlisted Language");
            foreach (string entry in entries.Skip(1))             //skip the header
            {
                string[] items = entry.Split('\t');
                if (items.Length != 4)
                {
                    continue;
                }
                if (items[2].Contains('!'))                //temporary suppression of entries while waiting for Ethnologue changes
                {
                    continue;
                }

                string code = items[0].Trim();
                string twoLetterCode;
                if (threeToTwoLetter.TryGetValue(code, out twoLetterCode))
                {
                    code = twoLetterCode;
                }

                string       regionCode = items[1].Trim();
                LanguageInfo language   = GetOrCreateLanguageFromCode(code, regionCode == "?" ? "?" : StandardSubtags.RegisteredRegions[regionCode].Name);

                string name = items[3].Trim();


                if (items[2] == "L")
                {
                    while (language.Names.Contains(name))
                    {
                        language.Names.Remove(name);
                    }
                    language.Names.Insert(0, name);
                }
                else
                {
                    if (items[2].Contains("P"))
                    {
                        //Skip pejorative
                    }
                    else if (items[1] == ("ET"))
                    {
                        //Skip alternatives for Ethiopia, as per request
                    }
                    else if (items[0] == "gax" || items[0] == "om")
                    {
                        //For these two "Oromo" languages, skip all related languages as per request
                    }
                    else if (!language.Names.Contains(name))
                    {
                        language.Names.Add(name);                         //intentionally not lower-casing
                    }
                }
            }

            IEnumerable <IGrouping <string, string> > languageGroups = Sldr.LanguageTags.Where(info => info.IsAvailable && IetfLanguageTag.IsValid(info.LanguageTag))
                                                                       .Select(info => IetfLanguageTag.Canonicalize(info.LanguageTag))
                                                                       .GroupBy(IetfLanguageTag.GetLanguagePart);

            foreach (IGrouping <string, string> languageGroup in languageGroups)
            {
                string[] langTags = languageGroup.ToArray();
                if (langTags.Length == 1)
                {
                    string       langTag = langTags[0];
                    LanguageInfo language;
                    if (langTag != languageGroup.Key && _codeToLanguageIndex.TryGetValue(languageGroup.Key, out language))
                    {
                        _codeToLanguageIndex.Remove(languageGroup.Key);
                        language.LanguageTag          = langTag;
                        _codeToLanguageIndex[langTag] = language;
                    }
                }
                else
                {
                    foreach (string langTag in langTags)
                    {
                        LanguageSubtag languageSubtag;
                        ScriptSubtag   scriptSubtag;
                        RegionSubtag   regionSubtag;
                        IEnumerable <VariantSubtag> variantSubtags;
                        if (IetfLanguageTag.TryGetSubtags(langTag, out languageSubtag, out scriptSubtag, out regionSubtag, out variantSubtags))
                        {
                            if (langTag == languageSubtag)
                            {
                                continue;
                            }

                            LanguageInfo language      = GetOrCreateLanguageFromCode(langTag, regionSubtag == null ? "?" : regionSubtag.Name);
                            bool         displayScript = scriptSubtag != null && !IetfLanguageTag.IsScriptImplied(langTag);
                            LanguageInfo otherLanguage;
                            if (langTag != languageSubtag && !displayScript && _codeToLanguageIndex.TryGetValue(languageSubtag, out otherLanguage) && language.Countries.SetEquals(otherLanguage.Countries))
                            {
                                language.Names.AddRange(otherLanguage.Names);
                            }
                            else
                            {
                                string name = displayScript ? string.Format("{0} ({1})", languageSubtag.Name, scriptSubtag.Name) : languageSubtag.Name;
                                if (!language.Names.Contains(name))
                                {
                                    language.Names.Add(name);                                     //intentionally not lower-casing
                                }
                            }
                        }
                    }
                }
            }

            foreach (LanguageInfo languageInfo in _codeToLanguageIndex.Values)
            {
                foreach (string name in languageInfo.Names)
                {
                    GetOrCreateListFromName(name).Add(languageInfo);
                }

                if (languageInfo.Names.Count == 0)
                {
                    continue;                     // this language is suppressed
                }
                //Why just this small set? Only out of convenience. Ideally we'd have a db of all languages as they write it in their literature.
                string localName = null;
                switch (languageInfo.Names[0])
                {
                case "French":
                    localName = "français";
                    break;

                case "Spanish":
                    localName = "español";
                    break;

                case "Chinese":
                    localName = "中文";
                    break;

                case "Hindi":
                    localName = "हिन्दी";
                    break;

                case "Bengali":
                    localName = "বাংলা";
                    break;

                case "Telugu":
                    localName = "తెలుగు";
                    break;

                case "Tamil":
                    localName = "தமிழ்";
                    break;

                case "Urdu":
                    localName = "اُردُو";
                    break;

                case "Arabic":
                    localName = "العربية/عربي";
                    break;

                case "Thai":
                    localName = "ภาษาไทย";
                    break;

                case "Indonesian":
                    localName = "Bahasa Indonesia";
                    break;
                }
                if (!string.IsNullOrEmpty(localName))
                {
                    if (!languageInfo.Names.Remove(localName))
                    {
                        GetOrCreateListFromName(localName).Add(languageInfo);
                    }
                    languageInfo.Names.Insert(0, localName);
                }
            }
        }
Пример #9
0
        /// <summary>
        /// Initializes a new instance of the <see cref="LanguageDataIndex"/> class.
        /// </summary>
        public LanguageDataIndex(IDictionary <string, string> sourcefiles)
        {
            string twotothreecodes = sourcefiles["TwoToThreeCodes.txt"];
            string subtagregistry  = sourcefiles["ianaSubtagRegistry.txt"];

            StandardSubtags.InitialiseIanaSubtags(twotothreecodes, subtagregistry);

            // First read in Ethnologue data file into temporary dictionary
            var threeToTwoLetter = StandardSubtags.TwoAndThreeMap(twotothreecodes, true);

            //LanguageIndex.txt Format: LangID	CountryID	NameType	Name
            //a language appears on one row for each of its alternative langauges
            string languageindex = sourcefiles["LanguageIndex.txt"];
            var    entries       = new List <string>(languageindex.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));

            entries.Add("qaa\t?\tL\tUnlisted Language");

            foreach (string entry in entries.Skip(1))             //skip the header
            {
                string[] items = entry.Split('\t');
                if (items.Length != 4)
                {
                    continue;
                }
                if (items[2].StartsWith("!"))                 //temporary suppression of entries while waiting for Ethnologue changes
                {
                    continue;
                }
                // excluded by !
                // all gax (ET,KE,SO) including L
                // all gaz (ET) including L
                // all hae (ET) including L

                string code = items[0].Trim();
                string twoLetterCode;
                string threelettercode = code;
                if (threeToTwoLetter.TryGetValue(code, out twoLetterCode))
                {
                    code = twoLetterCode;
                }

                //temporary suppression of entries while waiting for Ethnologue changes (those excluded by !)
                if (ExcludedCodes.Contains(code))
                {
                    continue;
                }

                string       regionCode = items[1].Trim();
                LanguageInfo language   = GetOrCreateLanguageFromCode(code, threelettercode, regionCode == "?" ? "" : StandardSubtags.RegisteredRegions[regionCode].Name);

                string name = items[3].Trim();


                if (items[2].Trim() == "L")
                {
                    while (language.Names.Contains(name))
                    {
                        language.Names.Remove(name);
                    }
                    language.Names.Insert(0, name);
                }
                else
                {
                    if (items[2].Contains("P"))
                    {
                        //Skip pejorative
                    }
                    else if (ExcludedRegions.Contains(StandardSubtags.RegisteredRegions[regionCode].Name))
                    {
                        //Skip alternatives for Ethiopia, as per request
                    }
                    else if (code == "gax" || code == "om")
                    {
                        //For these two "Oromo" languages, skip all related languages as per request
                    }
                    else if (!language.Names.Contains(name))
                    {
                        language.Names.Add(name);                         //intentionally not lower-casing
                    }
                }
            }

            // Then for each registered ietf language tag create a real entry and add the ethnologue data to it
            IOrderedEnumerable <LanguageSubtag> languages = StandardSubtags.RegisteredLanguages.OrderBy(lang => lang.Iso3Code);

            foreach (LanguageSubtag language in languages)
            {
                bool singlename = false;
                if (language.IsDeprecated || ExcludedCodes.Contains(language.Code))
                {
                    continue;
                }
                LanguageInfo langinfo = GetOrCreateLanguageFromCode(language.Code, language.Iso3Code, null);
                langinfo.DesiredName     = language.Name.Replace("'", "’");
                langinfo.IsMacroLanguage = language.IsMacroLanguage;

                foreach (string country in langinfo.Countries)
                {
                    if (ExcludedRegions.Contains(country))
                    {
                        singlename = true;
                    }
                }

                foreach (string name in language.Names)
                {
                    string langname = name.Replace("'", "’");
                    if (!langinfo.Names.Contains(langname))
                    {
                        if (singlename && langinfo.Names.Count == 1)
                        {
                            // leave single ethnologue names
                            break;
                        }
                        else
                        {
                            langinfo.Names.Add(langname);
                        }
                    }
                    if (singlename)
                    {
                        break;
                    }
                }
                _codeToLanguageIndex.Add(language.Code, langinfo);
            }

            IEnumerable <IGrouping <string, string> > languageGroups = Sldr.LanguageTags.Where(info => info.IsAvailable && IetfLanguageTag.IsValid(info.LanguageTag))
                                                                       .Select(info => IetfLanguageTag.Canonicalize(info.LanguageTag))
                                                                       .GroupBy(IetfLanguageTag.GetLanguagePart);

            foreach (IGrouping <string, string> languageGroup in languageGroups)
            {
                string[] langTags = languageGroup.ToArray();
                if (langTags.Length == 1)
                {
                    string       langTag = langTags[0];
                    LanguageInfo language;
                    if (langTag != languageGroup.Key && _codeToLanguageIndex.TryGetValue(languageGroup.Key, out language))
                    {
                        _codeToLanguageIndex.Remove(languageGroup.Key);
                        language.LanguageTag          = langTag;
                        _codeToLanguageIndex[langTag] = language;
                    }
                }
                else
                {
                    foreach (string langTag in langTags)
                    {
                        LanguageSubtag languageSubtag;
                        ScriptSubtag   scriptSubtag;
                        RegionSubtag   regionSubtag;
                        IEnumerable <VariantSubtag> variantSubtags;
                        if (IetfLanguageTag.TryGetSubtags(langTag, out languageSubtag, out scriptSubtag, out regionSubtag, out variantSubtags))
                        {
                            if (langTag == languageSubtag)
                            {
                                continue;
                            }

                            LanguageInfo language      = GetOrCreateLanguageFromCode(langTag, langTag, regionSubtag == null ? "" : regionSubtag.Name);                        // changed to default to "" 2017-04-24
                            bool         displayScript = scriptSubtag != null && !IetfLanguageTag.IsScriptImplied(langTag);
                            LanguageInfo otherLanguage;
                            if (langTag != languageSubtag && !displayScript && _codeToLanguageIndex.TryGetValue(languageSubtag, out otherLanguage) && language.Countries.SetEquals(otherLanguage.Countries))
                            {
                                language.Names.AddRange(otherLanguage.Names);
                            }
                            else
                            {
                                string name = displayScript ? string.Format("{0} ({1})", languageSubtag.Name, scriptSubtag.Name) : languageSubtag.Name;
                                if (!language.Names.Contains(name))
                                {
                                    language.Names.Add(name);                                     //intentionally not lower-casing
                                }
                            }
                            LanguageInfo keylanguage;
                            if (_codeToLanguageIndex.TryGetValue(languageGroup.Key, out keylanguage))
                            {
                                language.IsMacroLanguage = keylanguage.IsMacroLanguage;
                            }
                            _codeToLanguageIndex.Add(langTag, language);
                        }
                    }
                }
            }

            string languagecodes = sourcefiles["LanguageCodes.txt"];
            var    codeentries   = new List <string>(languagecodes.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));

            foreach (var languageCode in codeentries)
            {
                var data = languageCode.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
                if (data.Length < 2)
                {
                    continue;
                }
                var    langCode = data[0];
                string twoLetterCode;
                if (threeToTwoLetter.TryGetValue(langCode, out twoLetterCode))
                {
                    langCode = twoLetterCode;
                }
                if (langCode == "fuv")
                {
                    langCode = "fuv-Arab";                      // special case because the script has been added to this language code
                }
                // which is probably something to do with the SLDR
                var          countryCode = data[1];
                LanguageInfo lang;
                if (_codeToLanguageIndex.TryGetValue(langCode, out lang))
                {
                    lang.PrimaryCountry = StandardSubtags.RegisteredRegions[countryCode].Name;
                }
            }

            // localise some language names
            foreach (LanguageInfo languageInfo in _codeToLanguageIndex.Values)
            {
                if (languageInfo.Names.Count == 0)
                {
                    continue;                     // this language is suppressed
                }
                //Why just this small set? Only out of convenience. Ideally we'd have a db of all languages as they write it in their literature.
                string localName = null;
                switch (languageInfo.Names[0])
                {
                case "French":
                    localName = "français";
                    break;

                case "Spanish":
                    localName = "español";
                    break;

                case "Chinese":
                    localName = "中文";
                    break;

                case "Hindi":
                    localName = "हिन्दी";
                    break;

                case "Bengali":
                    localName = "বাংলা";
                    break;

                case "Telugu":
                    localName = "తెలుగు";
                    break;

                case "Tamil":
                    localName = "தமிழ்";
                    break;

                case "Urdu":
                    localName = "اُردُو";
                    break;

                case "Arabic":
                    localName = "العربية/عربي";
                    break;

                case "Thai":
                    localName = "ภาษาไทย";
                    break;

                case "Indonesian":
                    localName = "Bahasa Indonesia";
                    break;
                }
                if (!string.IsNullOrEmpty(localName))
                {
                    if (languageInfo.Names.Contains(localName))
                    {
                        languageInfo.Names.Remove(localName);
                    }
                    languageInfo.Names.Insert(0, localName);
                    languageInfo.DesiredName = localName;
                }

                switch (languageInfo.ThreeLetterTag)
                {
                case "itd":                         // 2 temporary special cases because the LanguageCodes.txt files needs to be updated with LanguageIndex.txt
                    languageInfo.PrimaryCountry = "Indonesia";
                    break;

                case "xak":
                    languageInfo.PrimaryCountry = "Venezuela";
                    break;

                default:
                    // Also set the PrimaryCountry if there is only one country
                    if (String.IsNullOrEmpty(languageInfo.PrimaryCountry) && languageInfo.Countries.Count == 1)
                    {
                        languageInfo.PrimaryCountry = languageInfo.Countries.First();
                    }
                    break;
                }
            }

            // check if any languages are found in multiple countries but do not have a primary country
            // there is a test for this in LanguageLookupTests.llExpectedLanguagesHaveUniquePrimaryCountries
            var languagesWithoutRegions = new List <LanguageInfo>();

            foreach (var lang in _codeToLanguageIndex.Values)
            {
                if (String.IsNullOrEmpty(lang.PrimaryCountry))
                {
                    languagesWithoutRegions.Add(lang);
                }
            }
            var languagesWithAmbiguousPrimaryCountry = languagesWithoutRegions.Where(l => l.Countries.Count() > 1);

            foreach (var lang in languagesWithAmbiguousPrimaryCountry)
            {
                Console.WriteLine("Language {0}({1}) has no primary country but is found in multiple countries", lang.DesiredName, lang.LanguageTag);
            }
        }