/// <summary>
/// Writes, for every loaded model, the sorted set of distinct accent codes it contains to a file.
/// </summary>
/// <remarks>
/// Models are loaded from the files matching "ngram*" under <c>basePath</c>. For the model at
/// zero-based index i the output file is "uniquecodes_(i + 1)", given as a relative path. The
/// file starts with the number of distinct codes, followed by one code per line ("\r\n" separated).
/// A code is built by translating every character of an accent string through
/// <c>VietConverter.AccentToCodeMap</c>.
/// </remarks>
/// <param name="modelVersion">Model version passed through to <c>Trainer.LoadModel</c>.</param>
/// <exception cref="InvalidOperationException">A loaded model is not a <c>Model0</c>.</exception>
/// <exception cref="KeyNotFoundException">An accent string contains a character missing from the accent-to-code map.</exception>
private static void GetUniqueAccentCodes(int modelVersion)
{
    var converter = new Utility.VietConverter();
    Dictionary<char, char> accentToCodeMap = converter.AccentToCodeMap;
    List<ILanguageModel> models = Trainer.LoadModel(Directory.GetFiles(basePath, "ngram*"), modelVersion);

    for (int i = 0; i < models.Count; i++)
    {
        // Fail with a descriptive error instead of a NullReferenceException on the first member access.
        Model0 model = models[i] as Model0;
        if (model == null)
        {
            throw new InvalidOperationException("Model " + (i + 1) + " is not a Model0 instance.");
        }

        // SortedSet keeps the codes ordered and de-duplicated with the same default string
        // comparer a SortedList would use, but without the O(n) array shifting on every insert.
        var uniqueCodes = new SortedSet<string>();
        foreach (var hashEntry in model.Map)
        {
            foreach (var accent in hashEntry.Value)
            {
                uniqueCodes.Add(String.Join("", accent.Key.Select(c => accentToCodeMap[c])));
            }
        }

        File.WriteAllText(
            "uniquecodes_" + (i + 1),
            uniqueCodes.Count + "\r\n" + String.Join("\r\n", uniqueCodes));
    }
}
/// <summary>
/// Prints per-model and aggregate statistics about the accent strings stored in the loaded models
/// to the console.
/// </summary>
/// <remarks>
/// Models are loaded from the files matching "ngram*" in the hard-coded relative directory
/// <c>..\..\..\..\AccenTypeWeb\App_Data\</c>. While scanning a model, every group of two or more
/// accent strings inside the same hash entry that translate to the same code is printed as a
/// comma-separated line. After each model a summary block is printed, and a "TOTAL" block over
/// all models is printed at the end.
/// </remarks>
/// <param name="modelVersion">Model version passed through to <c>Trainer.LoadModel</c>.</param>
/// <exception cref="InvalidOperationException">
/// A loaded model is not a <c>Model0</c>, or a Max/Min aggregate is evaluated over an empty
/// sequence (a model without hash entries, a hash entry without strings, or no models at all).
/// </exception>
/// <exception cref="KeyNotFoundException">
/// An accent string contains a character that is missing from the converter map used to translate it.
/// </exception>
private static void GetModelStatistics(int modelVersion)
{
    var converter = new Utility.VietConverter();
    Dictionary<char, char> mapLetterToCode = converter.AccentToCodeMap;
    List<ILanguageModel> models = Trainer.LoadModel(
        Directory.GetFiles(@"..\..\..\..\AccenTypeWeb\App_Data\", "ngram*"),
        modelVersion);

    // Aggregates over all models.
    var uniqueAccentStrings = new HashSet<string>();
    var uniqueAccentCodeCount = new HashSet<int>();

    for (int i = 0; i < models.Count; i++)
    {
        // Fail with a descriptive error instead of a NullReferenceException on the first member access.
        Model0 ni = models[i] as Model0;
        if (ni == null)
        {
            throw new InvalidOperationException("Model " + (i + 1) + " is not a Model0 instance.");
        }

        // Aggregates for the current model only.
        var perModelUniqueAccentStrings = new HashSet<string>();
        var perModelUniqueAccentCodes = new HashSet<string>();
        var perModelUniqueAccentCodeCount = new HashSet<int>();
        var perModelUniqueAccentRawStrings = new HashSet<string>();

        Dictionary<int, Dictionary<string, int>> map = ni.Map;
        foreach (Dictionary<string, int> mapStringToCount in map.Values)
        {
            // Groups the accent strings of this hash entry by their code so collisions can be reported.
            var mapCodeToStrings = new Dictionary<string, List<string>>();

            // Enumerate key/value pairs directly to avoid re-looking up the count by key.
            foreach (KeyValuePair<string, int> entry in mapStringToCount)
            {
                string accString = entry.Key;
                int count = entry.Value;

                uniqueAccentStrings.Add(accString);
                uniqueAccentCodeCount.Add(count);
                perModelUniqueAccentStrings.Add(accString);

                // Raw form: a character already present in RawCharMap is kept as is,
                // any other character is replaced through AccentToRawCharMap.
                perModelUniqueAccentRawStrings.Add(String.Join(String.Empty, accString.Select(
                    l => converter.RawCharMap.ContainsKey(l)
                        ? l
                        : converter.AccentToRawCharMap[l])));

                string accCode = String.Join(String.Empty, accString.Select(l => mapLetterToCode[l]));
                perModelUniqueAccentCodes.Add(accCode);
                perModelUniqueAccentCodeCount.Add(count);

                List<string> stringsForCode;
                if (!mapCodeToStrings.TryGetValue(accCode, out stringsForCode))
                {
                    stringsForCode = new List<string>();
                    mapCodeToStrings.Add(accCode, stringsForCode);
                }
                stringsForCode.Add(accString);
            }

            // Report accent strings of this hash entry that share one code.
            foreach (List<string> stringsForCode in mapCodeToStrings.Values)
            {
                if (stringsForCode.Count > 1)
                {
                    Console.WriteLine(String.Join(",", stringsForCode));
                }
            }
        }

        Console.WriteLine("MODEL {0}", i + 1);
        Console.WriteLine("# hashed entries: {0}", map.Count);
        // Mean over hash entries of the mean string length inside each entry.
        Console.WriteLine("Average code length: {0}",
            (double)map.Sum(he => (double)he.Value.Sum(acc => acc.Key.Length) / he.Value.Count) / map.Count);
        Console.WriteLine("Max code length: {0}", map.Max(he => he.Value.Max(acc => acc.Key.Length)));
        Console.WriteLine("Max # of codes per hash entry: {0}", map.Max(he => he.Value.Count));
        Console.WriteLine("# of codes: {0}", map.Sum(he => he.Value.Count));
        Console.WriteLine("# of unique strings: {0}", perModelUniqueAccentStrings.Count);
        Console.WriteLine("# of unique raw strings: {0}", perModelUniqueAccentRawStrings.Count);
        Console.WriteLine("# of unique codes: {0}", perModelUniqueAccentCodes.Count);
        Console.WriteLine("# of unique counts: {0}", perModelUniqueAccentCodeCount.Count);
        Console.WriteLine("Max count: {0}, min count: {1}",
            perModelUniqueAccentCodeCount.Max(), perModelUniqueAccentCodeCount.Min());
    }

    Console.WriteLine("TOTAL");
    Console.WriteLine("Total # of codes: {0}", uniqueAccentStrings.Count);
    Console.WriteLine("Total # of counts: {0}", uniqueAccentCodeCount.Count);
    Console.WriteLine("Max count: {0}, min count: {1}",
        uniqueAccentCodeCount.Max(), uniqueAccentCodeCount.Min());
}