public void Canonicalize_ImplicitScript_SuppressesScript()
{
	// Each row pairs an input tag with the canonical form expected from Canonicalize.
	// In every expected value the script subtag of the input has been dropped and the
	// casing of the remaining subtags has been normalized.
	var expectations = new[]
	{
		new[] { "en-Latn-US", "en-US" },
		new[] { "zh-hans-Cn-x-stuff", "zh-CN-x-stuff" },
		new[] { "zH-hans-Cn", "zh-CN" },
		new[] { "zH-Hant-tW-x-stuff", "zh-TW-x-stuff" },
		new[] { "Zh-hant-Tw", "zh-TW" },
		new[] { "oro-Latn", "oro" }
	};

	foreach (string[] expectation in expectations)
	{
		string input = expectation[0];
		string canonical = expectation[1];
		Assert.That(IetfLanguageTag.Canonicalize(input), Is.EqualTo(canonical));
	}
}
/// <summary>
/// Gets the language tag that replaces <paramref name="oldTag"/>, computing and caching it in
/// m_tagMap the first time a given old tag is seen.
/// </summary>
/// <param name="oldTag">The tag to look up or clean.</param>
/// <param name="newTag">Receives the replacement tag recorded in m_tagMap for <paramref name="oldTag"/>.</param>
/// <returns><c>true</c> if the replacement differs from <paramref name="oldTag"/> (ignoring case); otherwise <c>false</c>.</returns>
private bool TryGetNewLangTag(string oldTag, out string newTag)
{
	// Already mapped: reuse the cached answer.
	if (m_tagMap.TryGetValue(oldTag, out newTag))
	{
		return !newTag.Equals(oldTag, StringComparison.InvariantCultureIgnoreCase);
	}

	var cleaner = new IetfLanguageTagCleaner(oldTag);
	cleaner.Clean();

	// Canonicalize BEFORE testing for conflicts. The canonical form is what gets stored in the map,
	// so checking the un-canonicalized tag could let two old tags end up mapped to the same new tag.
	newTag = IetfLanguageTag.Canonicalize(cleaner.GetCompleteTag());
	while (m_tagMap.Values.Contains(newTag, StringComparer.InvariantCultureIgnoreCase))
	{
		// We can't use this tag because it would conflict with what we are mapping something else to.
		// Rebuild the tag with the next "duplicate" private-use part and try again.
		cleaner = new IetfLanguageTagCleaner(cleaner.Language, cleaner.Script, cleaner.Region, cleaner.Variant,
			WritingSystemIdMigrator.GetNextDuplPart(cleaner.PrivateUse));
		newTag = IetfLanguageTag.Canonicalize(cleaner.GetCompleteTag());
	}

	m_tagMap[oldTag] = newTag;
	return !newTag.Equals(oldTag, StringComparison.InvariantCultureIgnoreCase);
}
/// <summary>
/// API to query the SLDR for an LDML file and save it locally in the SLDR cache and specified directories
/// </summary>
/// <param name="destinationPath">Destination path to save the requested LDML file. The directory must already exist.</param>
/// <param name="languageTag">Current IETF language tag</param>
/// <param name="topLevelElements">List of top level element names to request. SLDR will always publish identity, so it doesn't need to be requested.
/// Must not be null; if the list is empty no element filter is sent, so the entire LDML file is requested.</param>
/// <param name="filename">Saved filename</param>
/// <returns>Enum status SldrStatus if file could be retrieved and the source</returns>
/// <exception cref="ArgumentException"><paramref name="destinationPath"/> is null or empty, or
/// <paramref name="languageTag"/> is null, empty or not a valid IETF language tag.</exception>
/// <exception cref="DirectoryNotFoundException"><paramref name="destinationPath"/> does not exist.</exception>
/// <exception cref="ArgumentNullException"><paramref name="topLevelElements"/> is null.</exception>
public static SldrStatus GetLdmlFile(string destinationPath, string languageTag, IEnumerable<string> topLevelElements, out string filename)
{
	CheckInitialized();

	if (String.IsNullOrEmpty(destinationPath))
	{
		throw new ArgumentException("The destination path must not be null or empty.", "destinationPath");
	}
	if (!Directory.Exists(destinationPath))
	{
		throw new DirectoryNotFoundException(string.Format("The destination directory '{0}' does not exist.", destinationPath));
	}
	if (String.IsNullOrEmpty(languageTag) || (!IetfLanguageTag.IsValid(languageTag)))
	{
		throw new ArgumentException("The language tag must be a valid IETF language tag.", "languageTag");
	}
	if (topLevelElements == null)
	{
		throw new ArgumentNullException("topLevelElements");
	}

	// Map the caller's tag to the tag the SLDR knows it by, when there is one.
	string sldrLanguageTag = IetfLanguageTag.Canonicalize(languageTag);
	SldrLanguageTagInfo langTagInfo;
	if (LanguageTags.TryGet(sldrLanguageTag, out langTagInfo))
	{
		sldrLanguageTag = langTagInfo.SldrLanguageTag;
	}
	string[] topLevelElementsArray = topLevelElements.ToArray();

	using (_sldrCacheMutex.Lock())
	{
		var status = SldrStatus.NotFound;
		CreateSldrCacheDirectory();
		string sldrCacheFilePath;
		bool redirected;
		do
		{
			string revid, uid = "", tempString;
			if (destinationPath == SldrCachePath)
			{
				filename = string.Format("{0}.{1}", sldrLanguageTag, LdmlExtension);
			}
			else
			{
				filename = string.Format("{0}.{1}", languageTag, LdmlExtension);
				// Check if LDML file already exists in destination and read revid and uid
				if (!ReadSilIdentity(Path.Combine(destinationPath, filename), out tempString, out uid))
				{
					uid = DefaultUserId;
				}
			}

			// If languageTag contains fonipa, don't bother trying to access the SLDR
			if (sldrLanguageTag.Contains(WellKnownSubtags.IpaVariant) || sldrLanguageTag.Contains(WellKnownSubtags.AudioScript))
			{
				return SldrStatus.NotFound;
			}

			// The cache file name carries the uid only when it is a non-default one.
			sldrCacheFilePath = Path.Combine(SldrCachePath, !string.IsNullOrEmpty(uid) && uid != DefaultUserId
				? string.Format("{0}-{1}.{2}", sldrLanguageTag, uid, LdmlExtension)
				: string.Format("{0}.{1}", sldrLanguageTag, LdmlExtension));

			// Read revid from cache file
			ReadSilIdentity(sldrCacheFilePath, out revid, out tempString);

			// Concatenate parameters for url string
			string requestedElements = string.Empty;
			if (topLevelElementsArray.Length > 0)
			{
				requestedElements = string.Format("&inc[]={0}", string.Join("&inc[]=", topLevelElementsArray));
			}
			string requestedUserId = !string.IsNullOrEmpty(uid) ? string.Format("&uid={0}", uid) : string.Empty;
			string requestedRevid = !string.IsNullOrEmpty(revid) ? string.Format("&revid={0}", revid) : string.Empty;
			string url = string.Format("{0}{1}?ext={2}&flatten=1{3}{4}{5}",
				SldrRepository, sldrLanguageTag, LdmlExtension,
				requestedElements, requestedUserId, requestedRevid);

			string tempFilePath = sldrCacheFilePath + "." + TmpExtension;

			// Using WebRequest instead of WebClient so we have access to disable AllowAutoRedirect
			var webRequest = (HttpWebRequest)WebRequest.Create(Uri.EscapeUriString(url));
			webRequest.AllowAutoRedirect = false;
			webRequest.UserAgent = UserAgent;
			webRequest.Timeout = 10000;

			try
			{
				if (_offlineMode)
				{
					throw new WebException("Test mode: SLDR offline so accessing cache", WebExceptionStatus.ConnectFailure);
				}

				// Check the response header to see if the requested LDML file got redirected
				using (var webResponse = (HttpWebResponse)webRequest.GetResponse())
				{
					if (webResponse.StatusCode == HttpStatusCode.NotModified)
					{
						// Report status that file is the most current from SLDR
						status = SldrStatus.FromSldr;
						redirected = false;
					}
					else if (webResponse.StatusCode == HttpStatusCode.MovedPermanently)
					{
						// Extract ietfLanguageTag from the response header, then loop to request the new tag
						var parsedresponse = HttpUtilityFromMono.ParseQueryString(webResponse.Headers["Location"]);
						sldrLanguageTag = parsedresponse.Get("ws_id").Split('?')[0];
						redirected = true;
					}
					else
					{
						// Download the LDML file to a temp file in case the transfer gets interrupted.
						// FileMode.Create truncates any stale temp file left behind by an earlier run, so
						// leftover bytes from a longer previous download cannot end up appended to this one.
						using (Stream responseStream = webResponse.GetResponseStream())
						using (var fs = new FileStream(tempFilePath, FileMode.Create, FileAccess.Write))
						{
							var buff = new byte[102400];
							int c;
							while ((c = responseStream.Read(buff, 0, buff.Length)) > 0)
							{
								fs.Write(buff, 0, c);
								fs.Flush();
							}
						}

						status = SldrStatus.FromSldr;
						sldrCacheFilePath = MoveTmpToCache(tempFilePath, uid);
						redirected = false;
					}
				}
			}
			catch (WebException we)
			{
				// Return from 404 error
				var errorResponse = (HttpWebResponse)we.Response;
				if ((we.Status == WebExceptionStatus.ProtocolError) && (errorResponse.StatusCode == HttpStatusCode.NotFound))
				{
					return SldrStatus.NotFound;
				}

				// Download failed so check SLDR cache
				string sldrCacheFilename;
				if (!string.IsNullOrEmpty(uid) && (uid != DefaultUserId))
				{
					sldrCacheFilename = string.Format("{0}-{1}.{2}", sldrLanguageTag, uid, LdmlExtension);
				}
				else
				{
					sldrCacheFilename = string.Format("{0}.{1}", sldrLanguageTag, LdmlExtension);
				}
				sldrCacheFilePath = Path.Combine(SldrCachePath, sldrCacheFilename);
				if (File.Exists(sldrCacheFilePath))
				{
					status = SldrStatus.FromCache;
				}
				else
				{
					return SldrStatus.UnableToConnectToSldr;
				}
				redirected = false;
			}
			finally
			{
				// On success the temp file has been handed to MoveTmpToCache; remove whatever is left behind.
				if (File.Exists(tempFilePath))
				{
					File.Delete(tempFilePath);
				}
			}
		} while (redirected);

		if (destinationPath != SldrCachePath)
		{
			// Copy from Cache to destination (w/o uid in filename), overwriting whatever used to be there
			File.Copy(sldrCacheFilePath, Path.Combine(destinationPath, filename), true);
		}

		return status;
	}
}
/// <summary>
/// Reads one version 1 LDML file and records everything PostMigrate needs to rewrite it:
/// an <c>LdmlMigrationInfo</c> (old/new language tag plus a setter for properties that are
/// no longer stored in the file) and a <c>Staging</c> entry for data that stays in the LDML
/// but is written in a different place. Nothing is written to disk here.
/// </summary>
/// <param name="sourceFilePath">Path of the LDML file to read.</param>
/// <param name="destinationFilePath">Not used by this method.</param>
public override void Migrate(string sourceFilePath, string destinationFilePath)
{
	string sourceFileName = Path.GetFileName(sourceFilePath);

	var legacyWs = new WritingSystemDefinitionV1();
	new LdmlAdaptorV1().Read(sourceFilePath, legacyWs);

	// Snapshot the simple properties; they are captured by the setter lambda below.
	string abbreviation = legacyWs.Abbreviation;
	float defaultFontSize = legacyWs.DefaultFontSize;
	string keyboard = legacyWs.Keyboard;
	string spellCheckingId = legacyWs.SpellCheckingId;
	string defaultFontName = legacyWs.DefaultFontName;

	// The two placeholder names are treated as "no name".
	string languageName;
	if (legacyWs.LanguageName.IsOneOf("Unknown Language", "Language Not Listed"))
	{
		languageName = string.Empty;
	}
	else
	{
		languageName = legacyWs.LanguageName;
	}

	// Build the cleaned, canonical language tag from the legacy subtags.
	string variant, privateUse;
	IetfLanguageTag.SplitVariantAndPrivateUse(legacyWs.Variant, out variant, out privateUse);
	var tagCleaner = new IetfLanguageTagCleaner(legacyWs.Language, legacyWs.Script, legacyWs.Region, variant, privateUse);
	tagCleaner.Clean();
	string langTag = IetfLanguageTag.Canonicalize(tagCleaner.GetCompleteTag());

	// Keyboard ids are "<locale>_<layout>", or just the layout when there is no locale.
	var knownKeyboards = new List<string>();
	foreach (var legacyKeyboard in legacyWs.KnownKeyboards)
	{
		if (string.IsNullOrEmpty(legacyKeyboard.Locale))
		{
			knownKeyboards.Add(legacyKeyboard.Layout);
		}
		else
		{
			knownKeyboards.Add(string.Format("{0}_{1}", legacyKeyboard.Locale, legacyKeyboard.Layout));
		}
	}

	bool isGraphiteEnabled = false;
	string legacyMapping = string.Empty;
	string scriptName = string.Empty;
	string regionName = string.Empty;
	string variantName = string.Empty;

	// Create system collation definition if applicable
	SystemCollationDefinition scd = null;
	bool sortsLikeOtherLanguage = legacyWs.SortUsing == WritingSystemDefinitionV1.SortRulesType.OtherLanguage;
	if (sortsLikeOtherLanguage && !string.IsNullOrEmpty(legacyWs.SortRules))
	{
		scd = new SystemCollationDefinition { LanguageTag = legacyWs.SortRules };
	}

	// Migrate fields from legacy fw namespace (the namespace itself is removed in PostMigrate)
	XElement ldmlElem = XElement.Load(sourceFilePath);
	XElement fwElem = ldmlElem.Elements("special")
		.FirstOrDefault(special => !string.IsNullOrEmpty((string)special.Attribute(XNamespace.Xmlns + "fw")));
	if (fwElem != null)
	{
		XElement graphiteElem = fwElem.Element(FW + "graphiteEnabled");
		if (graphiteElem != null)
		{
			// An unparsable value counts as "not enabled".
			bool parsedFlag;
			isGraphiteEnabled = bool.TryParse((string)graphiteElem.Attribute("value"), out parsedFlag) && parsedFlag;
		}

		XElement mappingElem = fwElem.Element(FW + "legacyMapping");
		if (mappingElem != null)
		{
			legacyMapping = (string)mappingElem.Attribute("value");
		}

		XElement scriptElem = fwElem.Element(FW + "scriptName");
		if (scriptElem != null)
		{
			scriptName = (string)scriptElem.Attribute("value");
		}

		XElement regionElem = fwElem.Element(FW + "regionName");
		if (regionElem != null)
		{
			regionName = (string)regionElem.Attribute("value");
		}

		XElement variantElem = fwElem.Element(FW + "variantName");
		if (variantElem != null)
		{
			variantName = (string)variantElem.Attribute("value");
		}
	}

	// Record the details for use in PostMigrate where we change the file name to match the IETF language tag where we can.
	var migrationInfo = new LdmlMigrationInfo(sourceFileName);
	migrationInfo.LanguageTagBeforeMigration = legacyWs.Bcp47Tag;
	migrationInfo.LanguageTagAfterMigration = langTag;
	migrationInfo.RemovedPropertiesSetter = ws =>
	{
		// Only non-empty legacy values are applied to the migrated writing system.
		if (!string.IsNullOrEmpty(abbreviation))
			ws.Abbreviation = abbreviation;
		if (defaultFontSize != 0)
			ws.DefaultFontSize = defaultFontSize;
		if (!string.IsNullOrEmpty(keyboard))
			ws.Keyboard = keyboard;
		if (!string.IsNullOrEmpty(spellCheckingId))
			ws.SpellCheckingId = spellCheckingId;
		if (!string.IsNullOrEmpty(defaultFontName))
			ws.DefaultFont = ws.Fonts[defaultFontName];
		if (!string.IsNullOrEmpty(languageName))
			ws.Language = new LanguageSubtag(ws.Language, languageName);

		ws.IsGraphiteEnabled = isGraphiteEnabled;

		if (!string.IsNullOrEmpty(legacyMapping))
			ws.LegacyMapping = legacyMapping;

		// Custom script/region names are only applied to private-use subtags.
		if (!string.IsNullOrEmpty(scriptName) && ws.Script != null && ws.Script.IsPrivateUse)
			ws.Script = new ScriptSubtag(ws.Script, scriptName);
		if (!string.IsNullOrEmpty(regionName) && ws.Region != null && ws.Region.IsPrivateUse)
			ws.Region = new RegionSubtag(ws.Region, regionName);

		if (scd != null)
			ws.DefaultCollation = scd;

		foreach (string keyboardId in knownKeyboards)
		{
			IKeyboardDefinition definition;
			if (!Keyboard.Controller.TryGetKeyboard(keyboardId, out definition))
			{
				definition = Keyboard.Controller.CreateKeyboard(keyboardId, KeyboardFormat.Unknown, Enumerable.Empty<string>());
			}
			ws.KnownKeyboards.Add(definition);
		}
	};
	_migrationInfo.Add(migrationInfo);

	// Store things that stay in ldml but are being moved: WindowsLcid, variantName, font, known keyboards,
	// collations, font features, character sets
	var staging = new Staging();
	staging.WindowsLcid = legacyWs.WindowsLcid;
	staging.DefaultFontName = legacyWs.DefaultFontName;
	staging.SortUsing = legacyWs.SortUsing;
	staging.SortRules = legacyWs.SortRules;

	// Determine if variantName is non-common private use before preserving it
	if (!string.IsNullOrEmpty(variantName))
	{
		int firstNonCommonIndex = IetfLanguageTag.GetIndexOfFirstNonCommonPrivateUseVariant(
			IetfLanguageTag.GetVariantSubtags(migrationInfo.LanguageTagAfterMigration));
		if (firstNonCommonIndex > -1)
		{
			staging.VariantName = variantName;
		}
	}

	if (fwElem != null)
	{
		// DefaultFontFeatures (only kept when there is a default font to attach them to)
		XElement featuresElem = fwElem.Element(FW + "defaultFontFeatures");
		if (featuresElem != null && !string.IsNullOrEmpty(staging.DefaultFontName))
		{
			staging.DefaultFontFeatures = (string)featuresElem.Attribute("value");
		}

		// MatchedPairs, PunctuationPatterns, QuotationMarks deprecated

		// Valid Chars: the attribute value is itself XML; if it does not parse as XML,
		// fall back to the legacy word-forming character overrides file.
		XElement validCharsElem = fwElem.Element(FW + "validChars");
		if (validCharsElem != null)
		{
			try
			{
				XElement parsedValidChars = XElement.Parse((string)validCharsElem.Attribute("value"));
				AddCharacterSet(parsedValidChars, staging, "WordForming", "main");
				AddCharacterSet(parsedValidChars, staging, "Numeric", "numeric");
				AddCharacterSet(parsedValidChars, staging, "Other", "punctuation");
			}
			catch (XmlException)
			{
				ParseLegacyWordformingCharOverridesFile(staging);
			}
		}
	}

	_staging[sourceFileName] = staging;
}
/// <summary>
/// Writes every file recorded by Migrate to <paramref name="destinationPath"/>, named after its
/// post-migration language tag, with the legacy special elements removed and the staged data
/// written back in its new location. Tag changes are recorded in the audit log and the
/// migration handler, if any, is invoked once at the end.
/// </summary>
/// <param name="sourcePath">Directory containing the files that were passed to Migrate.</param>
/// <param name="destinationPath">Directory that receives the migrated files.</param>
public override void PostMigrate(string sourcePath, string destinationPath)
{
	EnsureIeftLanguageTagsUnique(_migrationInfo);

	// Write them back, with their new file name.
	foreach (LdmlMigrationInfo info in _migrationInfo)
	{
		Staging staged = _staging[info.FileName];
		string inputPath = Path.Combine(sourcePath, info.FileName);
		string outputPath = Path.Combine(destinationPath, info.LanguageTagAfterMigration + ".ldml");

		XElement root = XElement.Load(inputPath);

		// Remove the legacy palaso, palaso2 and fw specials (identified by their xmlns declaration).
		foreach (string prefix in new[] { "palaso", "palaso2", "fw" })
		{
			string legacyPrefix = prefix;
			root.Elements("special")
				.Where(special => !string.IsNullOrEmpty((string)special.Attribute(XNamespace.Xmlns + legacyPrefix)))
				.Remove();
		}

		// Remove collations to repopulate later
		root.Elements("collations").Remove();

		// Write out the elements.
		XElement identity = root.Element("identity");
		WriteIdentityElement(identity, staged, IetfLanguageTag.Canonicalize(info.LanguageTagAfterMigration));

		XElement layout = root.Element("layout");
		WriteLayoutElement(layout);

		bool hasMainOrPunctuation = staged.CharacterSets.ContainsKey("main")
			|| staged.CharacterSets.ContainsKey("punctuation");
		if (hasMainOrPunctuation)
		{
			WriteCharactersElement(root.GetOrCreateElement("characters"), staged);
		}

		if (staged.CharacterSets.ContainsKey("numeric"))
		{
			WriteNumbersElement(root.GetOrCreateElement("numbers"), staged);
		}

		// Collations are written here for every sort type except OtherLanguage.
		if (staged.SortUsing != WritingSystemDefinitionV1.SortRulesType.OtherLanguage)
		{
			WriteCollationsElement(root.GetOrCreateElement("collations"), staged);
		}

		// If needed, create top level special for external resources
		if (!string.IsNullOrEmpty(staged.DefaultFontName))
		{
			XElement special = CreateSpecialElement(root);
			WriteTopLevelSpecialElements(special, staged);
		}

		var settings = CanonicalXmlSettings.CreateXmlWriterSettings();
		settings.NewLineOnAttributes = false;
		using (var writer = XmlWriter.Create(outputPath, settings))
		{
			root.WriteTo(writer);
		}

		bool tagChanged = info.LanguageTagBeforeMigration != info.LanguageTagAfterMigration;
		if (tagChanged)
		{
			_auditLog.LogChange(info.LanguageTagBeforeMigration, info.LanguageTagAfterMigration);
		}
	}

	if (_migrationHandler != null)
	{
		_migrationHandler(ToVersion, _migrationInfo);
	}
}
public void Canonicalize_NonImplicitScript_DoesNotSuppressScript()
{
	// These tags are expected to come back unchanged, script subtag included.
	foreach (string tag in new[] { "en-Cyrl-US", "sr-Latn" })
	{
		string canonical = IetfLanguageTag.Canonicalize(tag);
		Assert.That(canonical, Is.EqualTo(tag));
	}
}
public void Canonicalize_NonStandardCapitalization_StandardCapitalization()
{
	// Input with arbitrary casing in every subtag.
	const string oddlyCased = "zH-latn-cn-FonIpa-X-Etic";
	// Expected casing: lower-case language, title-case script, upper-case region,
	// lower-case variant and private-use parts.
	const string expected = "zh-Latn-CN-fonipa-x-etic";

	string actual = IetfLanguageTag.Canonicalize(oddlyCased);

	Assert.That(actual, Is.EqualTo(expected));
}
/// <summary>
/// Initializes a new instance of the <see cref="LanguageLookup"/> class.
/// Builds the code-to-language index from the embedded TwoToThreeCodes and LanguageIndex resources,
/// adds entries for the available SLDR language tags, then indexes every language by each of its names.
/// </summary>
public LanguageLookup()
{
	// Map three-letter codes to their two-letter equivalent (the resource lists "two<TAB>three").
	var threeToTwoLetter = new Dictionary<string, string>();
	foreach (string line in LanguageRegistryResources.TwoToThreeCodes.Replace("\r\n", "\n").Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries))
	{
		string[] items = line.Split('\t');
		threeToTwoLetter.Add(items[1].Trim(), items[0].Trim());
	}

	//LanguageIndex.txt Format: LangID	CountryID	NameType	Name
	//a language appears on one row for each of its alternative languages
	var entries = new List<string>(LanguageRegistryResources.LanguageIndex.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));
	entries.Add("qaa\t?\tL\tUnlisted Language");
	foreach (string entry in entries.Skip(1)) //skip the header
	{
		string[] items = entry.Split('\t');
		if (items.Length != 4)
		{
			continue;
		}
		if (items[2].Contains('!')) //temporary suppression of entries while waiting for Ethnologue changes
		{
			continue;
		}

		string code = items[0].Trim();
		string twoLetterCode;
		if (threeToTwoLetter.TryGetValue(code, out twoLetterCode))
		{
			code = twoLetterCode;
		}

		string regionCode = items[1].Trim();
		// "?" is the placeholder region used by the synthetic "qaa" entry; it is not looked up in the registry.
		LanguageInfo language = GetOrCreateLanguageFromCode(code, regionCode == "?" ? "?" : StandardSubtags.RegisteredRegions[regionCode].Name);

		string nameType = items[2].Trim();
		string name = items[3].Trim();
		if (nameType == "L")
		{
			// The primary ("L") name always goes first, without duplicates.
			while (language.Names.Contains(name))
			{
				language.Names.Remove(name);
			}
			language.Names.Insert(0, name);
		}
		else
		{
			if (nameType.Contains("P"))
			{
				//Skip pejorative
			}
			else if (regionCode == "ET")
			{
				//Skip alternatives for Ethiopia, as per request
			}
			else if (code == "gax" || code == "om")
			{
				//For these two "Oromo" languages, skip all related languages as per request.
				//The comparison uses the mapped code: the two-letter "om" only exists after the
				//three-to-two-letter conversion above, so testing the raw column could never match it.
			}
			else if (!language.Names.Contains(name))
			{
				language.Names.Add(name); //intentionally not lower-casing
			}
		}
	}

	// Group the valid, available SLDR tags by their language part.
	IEnumerable<IGrouping<string, string>> languageGroups = Sldr.LanguageTags.Where(info => info.IsAvailable && IetfLanguageTag.IsValid(info.LanguageTag))
		.Select(info => IetfLanguageTag.Canonicalize(info.LanguageTag))
		.GroupBy(IetfLanguageTag.GetLanguagePart);

	foreach (IGrouping<string, string> languageGroup in languageGroups)
	{
		string[] langTags = languageGroup.ToArray();
		if (langTags.Length == 1)
		{
			// A single SLDR tag for this language: re-key the existing entry under the full tag.
			string langTag = langTags[0];
			LanguageInfo language;
			if (langTag != languageGroup.Key && _codeToLanguageIndex.TryGetValue(languageGroup.Key, out language))
			{
				_codeToLanguageIndex.Remove(languageGroup.Key);
				language.LanguageTag = langTag;
				_codeToLanguageIndex[langTag] = language;
			}
		}
		else
		{
			foreach (string langTag in langTags)
			{
				LanguageSubtag languageSubtag;
				ScriptSubtag scriptSubtag;
				RegionSubtag regionSubtag;
				IEnumerable<VariantSubtag> variantSubtags;
				if (IetfLanguageTag.TryGetSubtags(langTag, out languageSubtag, out scriptSubtag, out regionSubtag, out variantSubtags))
				{
					if (langTag == languageSubtag)
					{
						// The bare language code keeps its existing entry.
						continue;
					}

					LanguageInfo language = GetOrCreateLanguageFromCode(langTag, regionSubtag == null ? "?" : regionSubtag.Name);
					bool displayScript = scriptSubtag != null && !IetfLanguageTag.IsScriptImplied(langTag);
					LanguageInfo otherLanguage;
					if (langTag != languageSubtag && !displayScript && _codeToLanguageIndex.TryGetValue(languageSubtag, out otherLanguage) && language.Countries.SetEquals(otherLanguage.Countries))
					{
						// Same countries as the bare language and no script to show: share its names.
						language.Names.AddRange(otherLanguage.Names);
					}
					else
					{
						string name = displayScript ? string.Format("{0} ({1})", languageSubtag.Name, scriptSubtag.Name) : languageSubtag.Name;
						if (!language.Names.Contains(name))
						{
							language.Names.Add(name); //intentionally not lower-casing
						}
					}
				}
			}
		}
	}

	foreach (LanguageInfo languageInfo in _codeToLanguageIndex.Values)
	{
		foreach (string name in languageInfo.Names)
		{
			GetOrCreateListFromName(name).Add(languageInfo);
		}

		if (languageInfo.Names.Count == 0)
		{
			continue; // this language is suppressed
		}

		//Why just this small set? Only out of convenience. Ideally we'd have a db of all languages as they write it in their literature.
		string localName = null;
		switch (languageInfo.Names[0])
		{
			case "French":
				localName = "français";
				break;
			case "Spanish":
				localName = "español";
				break;
			case "Chinese":
				localName = "中文";
				break;
			case "Hindi":
				localName = "हिन्दी";
				break;
			case "Bengali":
				localName = "বাংলা";
				break;
			case "Telugu":
				localName = "తెలుగు";
				break;
			case "Tamil":
				localName = "தமிழ்";
				break;
			case "Urdu":
				localName = "اُردُو";
				break;
			case "Arabic":
				localName = "العربية/عربي";
				break;
			case "Thai":
				localName = "ภาษาไทย";
				break;
			case "Indonesian":
				localName = "Bahasa Indonesia";
				break;
		}

		if (!string.IsNullOrEmpty(localName))
		{
			// If the local name was not already one of the names it has not been indexed yet, so index it now;
			// either way it becomes the first name.
			if (!languageInfo.Names.Remove(localName))
			{
				GetOrCreateListFromName(localName).Add(languageInfo);
			}
			languageInfo.Names.Insert(0, localName);
		}
	}
}
/// <summary>
/// Initializes a new instance of the <see cref="LanguageDataIndex"/> class.
/// Builds the code-to-language index from the supplied data files: Ethnologue names from
/// LanguageIndex.txt, registered language subtags, available SLDR language tags and primary
/// countries from LanguageCodes.txt.
/// </summary>
/// <param name="sourcefiles">File contents keyed by file name. The entries "TwoToThreeCodes.txt",
/// "ianaSubtagRegistry.txt", "LanguageIndex.txt" and "LanguageCodes.txt" are read.</param>
public LanguageDataIndex(IDictionary<string, string> sourcefiles)
{
	string twotothreecodes = sourcefiles["TwoToThreeCodes.txt"];
	string subtagregistry = sourcefiles["ianaSubtagRegistry.txt"];

	StandardSubtags.InitialiseIanaSubtags(twotothreecodes, subtagregistry);

	// First read in Ethnologue data file into temporary dictionary
	var threeToTwoLetter = StandardSubtags.TwoAndThreeMap(twotothreecodes, true);

	//LanguageIndex.txt Format: LangID	CountryID	NameType	Name
	//a language appears on one row for each of its alternative languages
	string languageindex = sourcefiles["LanguageIndex.txt"];
	var entries = new List<string>(languageindex.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));
	entries.Add("qaa\t?\tL\tUnlisted Language");
	foreach (string entry in entries.Skip(1)) //skip the header
	{
		string[] items = entry.Split('\t');
		if (items.Length != 4)
		{
			continue;
		}
		if (items[2].StartsWith("!")) //temporary suppression of entries while waiting for Ethnologue changes
		{
			continue;
		}
		// excluded by !
		// all gax (ET,KE,SO) including L
		// all gaz (ET) including L
		// all hae (ET) including L

		string code = items[0].Trim();
		string twoLetterCode;
		string threelettercode = code;
		if (threeToTwoLetter.TryGetValue(code, out twoLetterCode))
		{
			code = twoLetterCode;
		}

		//temporary suppression of entries while waiting for Ethnologue changes (those excluded by !)
		if (ExcludedCodes.Contains(code))
		{
			continue;
		}

		string regionCode = items[1].Trim();
		// Resolve the region name once. "?" is the placeholder region (used by the synthetic "qaa" entry)
		// and is not looked up in the registry; reusing this value below keeps the excluded-region
		// check from indexing the registry with "?".
		string regionName = regionCode == "?" ? "" : StandardSubtags.RegisteredRegions[regionCode].Name;
		LanguageInfo language = GetOrCreateLanguageFromCode(code, threelettercode, regionName);

		string name = items[3].Trim();
		if (items[2].Trim() == "L")
		{
			// The primary ("L") name always goes first, without duplicates.
			while (language.Names.Contains(name))
			{
				language.Names.Remove(name);
			}
			language.Names.Insert(0, name);
		}
		else
		{
			if (items[2].Contains("P"))
			{
				//Skip pejorative
			}
			else if (ExcludedRegions.Contains(regionName))
			{
				//Skip alternatives for Ethiopia, as per request
			}
			else if (code == "gax" || code == "om")
			{
				//For these two "Oromo" languages, skip all related languages as per request
			}
			else if (!language.Names.Contains(name))
			{
				language.Names.Add(name); //intentionally not lower-casing
			}
		}
	}

	// Then for each registered ietf language tag create a real entry and add the ethnologue data to it
	IOrderedEnumerable<LanguageSubtag> languages = StandardSubtags.RegisteredLanguages.OrderBy(lang => lang.Iso3Code);
	foreach (LanguageSubtag language in languages)
	{
		if (language.IsDeprecated || ExcludedCodes.Contains(language.Code))
		{
			continue;
		}

		LanguageInfo langinfo = GetOrCreateLanguageFromCode(language.Code, language.Iso3Code, null);
		langinfo.DesiredName = language.Name.Replace("'", "’");
		langinfo.IsMacroLanguage = language.IsMacroLanguage;

		// Languages found in an excluded region get at most one name.
		bool singlename = false;
		foreach (string country in langinfo.Countries)
		{
			if (ExcludedRegions.Contains(country))
			{
				singlename = true;
			}
		}

		foreach (string name in language.Names)
		{
			string langname = name.Replace("'", "’");
			if (!langinfo.Names.Contains(langname))
			{
				if (singlename && langinfo.Names.Count == 1)
				{
					// leave single ethnologue names
					break;
				}
				else
				{
					langinfo.Names.Add(langname);
				}
			}
			if (singlename)
			{
				break;
			}
		}
		_codeToLanguageIndex.Add(language.Code, langinfo);
	}

	// Group the valid, available SLDR tags by their language part.
	IEnumerable<IGrouping<string, string>> languageGroups = Sldr.LanguageTags.Where(info => info.IsAvailable && IetfLanguageTag.IsValid(info.LanguageTag))
		.Select(info => IetfLanguageTag.Canonicalize(info.LanguageTag))
		.GroupBy(IetfLanguageTag.GetLanguagePart);

	foreach (IGrouping<string, string> languageGroup in languageGroups)
	{
		string[] langTags = languageGroup.ToArray();
		if (langTags.Length == 1)
		{
			// A single SLDR tag for this language: re-key the existing entry under the full tag.
			string langTag = langTags[0];
			LanguageInfo language;
			if (langTag != languageGroup.Key && _codeToLanguageIndex.TryGetValue(languageGroup.Key, out language))
			{
				_codeToLanguageIndex.Remove(languageGroup.Key);
				language.LanguageTag = langTag;
				_codeToLanguageIndex[langTag] = language;
			}
		}
		else
		{
			foreach (string langTag in langTags)
			{
				LanguageSubtag languageSubtag;
				ScriptSubtag scriptSubtag;
				RegionSubtag regionSubtag;
				IEnumerable<VariantSubtag> variantSubtags;
				if (IetfLanguageTag.TryGetSubtags(langTag, out languageSubtag, out scriptSubtag, out regionSubtag, out variantSubtags))
				{
					if (langTag == languageSubtag)
					{
						// The bare language code keeps its existing entry.
						continue;
					}

					LanguageInfo language = GetOrCreateLanguageFromCode(langTag, langTag, regionSubtag == null ? "" : regionSubtag.Name); // changed to default to "" 2017-04-24
					bool displayScript = scriptSubtag != null && !IetfLanguageTag.IsScriptImplied(langTag);
					LanguageInfo otherLanguage;
					if (langTag != languageSubtag && !displayScript && _codeToLanguageIndex.TryGetValue(languageSubtag, out otherLanguage) && language.Countries.SetEquals(otherLanguage.Countries))
					{
						// Same countries as the bare language and no script to show: share its names.
						language.Names.AddRange(otherLanguage.Names);
					}
					else
					{
						string name = displayScript ? string.Format("{0} ({1})", languageSubtag.Name, scriptSubtag.Name) : languageSubtag.Name;
						if (!language.Names.Contains(name))
						{
							language.Names.Add(name); //intentionally not lower-casing
						}
					}

					LanguageInfo keylanguage;
					if (_codeToLanguageIndex.TryGetValue(languageGroup.Key, out keylanguage))
					{
						language.IsMacroLanguage = keylanguage.IsMacroLanguage;
					}
					_codeToLanguageIndex.Add(langTag, language);
				}
			}
		}
	}

	// Assign primary countries from LanguageCodes.txt (first column: language code, second: country code).
	string languagecodes = sourcefiles["LanguageCodes.txt"];
	var codeentries = new List<string>(languagecodes.Split(new[] { "\n" }, StringSplitOptions.RemoveEmptyEntries));
	foreach (var languageCode in codeentries)
	{
		var data = languageCode.Split(new[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
		if (data.Length < 2)
		{
			continue;
		}
		var langCode = data[0];
		string twoLetterCode;
		if (threeToTwoLetter.TryGetValue(langCode, out twoLetterCode))
		{
			langCode = twoLetterCode;
		}
		if (langCode == "fuv")
		{
			langCode = "fuv-Arab"; // special case because the script has been added to this language code
			// which is probably something to do with the SLDR
		}
		var countryCode = data[1];
		LanguageInfo lang;
		if (_codeToLanguageIndex.TryGetValue(langCode, out lang))
		{
			lang.PrimaryCountry = StandardSubtags.RegisteredRegions[countryCode].Name;
		}
	}

	// localise some language names
	foreach (LanguageInfo languageInfo in _codeToLanguageIndex.Values)
	{
		if (languageInfo.Names.Count == 0)
		{
			continue; // this language is suppressed
		}

		//Why just this small set? Only out of convenience. Ideally we'd have a db of all languages as they write it in their literature.
		string localName = null;
		switch (languageInfo.Names[0])
		{
			case "French":
				localName = "français";
				break;
			case "Spanish":
				localName = "español";
				break;
			case "Chinese":
				localName = "中文";
				break;
			case "Hindi":
				localName = "हिन्दी";
				break;
			case "Bengali":
				localName = "বাংলা";
				break;
			case "Telugu":
				localName = "తెలుగు";
				break;
			case "Tamil":
				localName = "தமிழ்";
				break;
			case "Urdu":
				localName = "اُردُو";
				break;
			case "Arabic":
				localName = "العربية/عربي";
				break;
			case "Thai":
				localName = "ภาษาไทย";
				break;
			case "Indonesian":
				localName = "Bahasa Indonesia";
				break;
		}

		if (!string.IsNullOrEmpty(localName))
		{
			// Move (or add) the local name to the front and make it the desired name.
			languageInfo.Names.Remove(localName);
			languageInfo.Names.Insert(0, localName);
			languageInfo.DesiredName = localName;
		}

		switch (languageInfo.ThreeLetterTag)
		{
			case "itd": // 2 temporary special cases because the LanguageCodes.txt files needs to be updated with LanguageIndex.txt
				languageInfo.PrimaryCountry = "Indonesia";
				break;
			case "xak":
				languageInfo.PrimaryCountry = "Venezuela";
				break;
			default:
				// Also set the PrimaryCountry if there is only one country
				if (String.IsNullOrEmpty(languageInfo.PrimaryCountry) && languageInfo.Countries.Count == 1)
				{
					languageInfo.PrimaryCountry = languageInfo.Countries.First();
				}
				break;
		}
	}

	// check if any languages are found in multiple countries but do not have a primary country
	// there is a test for this in LanguageLookupTests.llExpectedLanguagesHaveUniquePrimaryCountries
	foreach (var lang in _codeToLanguageIndex.Values)
	{
		if (String.IsNullOrEmpty(lang.PrimaryCountry) && lang.Countries.Count > 1)
		{
			Console.WriteLine("Language {0}({1}) has no primary country but is found in multiple countries", lang.DesiredName, lang.LanguageTag);
		}
	}
}