Ejemplo n.º 1
0
        /// <summary>
        /// Loads the language models from the files matching "ngram*" under <c>basePath</c> and,
        /// for each model, writes the distinct accent codes it contains to a file named
        /// "uniquecodes_N", where N is the 1-based position of the model in the loaded list.
        /// </summary>
        /// <remarks>
        /// A code is obtained by mapping every character of an accent string (the keys of the
        /// inner dictionaries of <c>Model0.Map</c>) through <c>VietConverter.AccentToCodeMap</c>.
        /// Each output file holds the number of distinct codes on the first line, followed by the
        /// codes in the default string sort order, one per line, separated by "\r\n".
        /// The files are written to relative paths, i.e. they are resolved against the current
        /// working directory.
        /// </remarks>
        /// <param name="modelVersion">Model version, passed through to <c>Trainer.LoadModel</c>.</param>
        /// <exception cref="InvalidOperationException">A loaded model is not a <c>Model0</c>.</exception>
        /// <exception cref="KeyNotFoundException">
        /// An accent string contains a character that has no entry in the accent-to-code map.
        /// </exception>
        private static void GetUniqueAccentCodes(int modelVersion)
        {
            var converter = new Utility.VietConverter();
            Dictionary <char, char> accentToCodeMap = converter.AccentToCodeMap;

            List <ILanguageModel> models = Trainer.LoadModel(Directory.GetFiles(basePath, "ngram*"), modelVersion);

            for (int i = 0; i < models.Count; i++)
            {
                // Fail with a descriptive error instead of a NullReferenceException further down
                // when the loaded model is not of the expected concrete type.
                Model0 ni = models[i] as Model0;
                if (ni == null)
                {
                    throw new InvalidOperationException(
                              String.Format("Model at index {0} is not a Model0 instance.", i));
                }

                // SortedSet keeps the codes unique and ordered with the same default comparer
                // that a SortedList<string, int> would use, without a dummy value per key and
                // without the array shifting that every SortedList insertion causes.
                var uniqueCodes = new SortedSet <string>();

                foreach (Dictionary <string, int> accentStrings in ni.Map.Values)
                {
                    foreach (string accentString in accentStrings.Keys)
                    {
                        uniqueCodes.Add(String.Join("", accentString.Select(c => accentToCodeMap[c])));
                    }
                }

                File.WriteAllText("uniquecodes_" + (i + 1),
                                  uniqueCodes.Count + "\r\n" + String.Join("\r\n", uniqueCodes));
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Loads the language models from the "ngram*" files of the AccenTypeWeb App_Data
        /// directory (relative path) and prints statistics about them to the console.
        /// </summary>
        /// <remarks>
        /// For every model this prints:
        /// <list type="bullet">
        /// <item>for each hash entry, the accent strings that map to the same accent code
        /// (one comma-separated line per colliding group);</item>
        /// <item>the number of hash entries, the average and maximum accent string length,
        /// the maximum and total number of accent strings per entry;</item>
        /// <item>the number of distinct accent strings, raw strings, codes and count values,
        /// and the largest and smallest count value.</item>
        /// </list>
        /// After all models, the totals across models are printed.
        /// A model whose map is empty only gets its header and entry count, and the final
        /// max/min line is omitted when no counts were collected, because Max()/Min() throw on
        /// empty sequences.
        /// NOTE(review): the per-model aggregates still assume that every inner dictionary of a
        /// non-empty map has at least one entry - confirm against the model format.
        /// </remarks>
        /// <param name="modelVersion">Model version, passed through to <c>Trainer.LoadModel</c>.</param>
        /// <exception cref="InvalidOperationException">A loaded model is not a <c>Model0</c>.</exception>
        /// <exception cref="KeyNotFoundException">
        /// An accent string contains a character that has no entry in the accent-to-code map.
        /// </exception>
        private static void GetModelStatistics(int modelVersion)
        {
            var converter = new Utility.VietConverter();
            Dictionary <char, char> mapLetterToCode = converter.AccentToCodeMap;

            List <ILanguageModel> models = Trainer.LoadModel(Directory.GetFiles(@"..\..\..\..\AccenTypeWeb\App_Data\", "ngram*"), modelVersion);

            // Accumulated across all models.
            var uniqueAccentStrings   = new HashSet <string>();
            var uniqueAccentCodeCount = new HashSet <int>();

            for (int i = 0; i < models.Count; i++)
            {
                var perModelUniqueAccentStrings    = new HashSet <string>();
                var perModelUniqueAccentCodes      = new HashSet <string>();
                var perModelUniqueAccentCodeCount  = new HashSet <int>();
                var perModelUniqueAccentRawStrings = new HashSet <string>();

                // Fail with a descriptive error instead of a NullReferenceException further down
                // when the loaded model is not of the expected concrete type.
                Model0 ni = models[i] as Model0;
                if (ni == null)
                {
                    throw new InvalidOperationException(
                              String.Format("Model at index {0} is not a Model0 instance.", i));
                }

                Dictionary <int, Dictionary <string, int> > map = ni.Map;
                foreach (Dictionary <string, int> mapStringToCount in map.Values)
                {
                    // Groups the accent strings of this hash entry by their accent code so that
                    // strings sharing a code can be reported below.
                    var mapCodeToStrings = new Dictionary <string, List <string> >();

                    // Enumerate the pairs directly to avoid a second lookup for every count.
                    foreach (KeyValuePair <string, int> stringAndCount in mapStringToCount)
                    {
                        string accString = stringAndCount.Key;
                        int    count     = stringAndCount.Value;

                        uniqueAccentStrings.Add(accString);
                        uniqueAccentCodeCount.Add(count);

                        perModelUniqueAccentStrings.Add(accString);

                        // Characters that are already keys of RawCharMap are kept as they are;
                        // all others are translated through AccentToRawCharMap.
                        perModelUniqueAccentRawStrings.Add(String.Join(String.Empty, accString.Select(
                                                                           l => converter.RawCharMap.ContainsKey(l) ? l : converter.AccentToRawCharMap[l]
                                                                           )));

                        string accCode = String.Join(String.Empty, accString.Select(l => mapLetterToCode[l]));
                        perModelUniqueAccentCodes.Add(accCode);
                        perModelUniqueAccentCodeCount.Add(count);

                        List <string> stringsWithSameCode;
                        if (!mapCodeToStrings.TryGetValue(accCode, out stringsWithSameCode))
                        {
                            stringsWithSameCode = new List <string>();
                            mapCodeToStrings.Add(accCode, stringsWithSameCode);
                        }
                        stringsWithSameCode.Add(accString);
                    }

                    // Report every group of accent strings that share a single code.
                    foreach (List <string> stringsWithSameCode in mapCodeToStrings.Values)
                    {
                        if (stringsWithSameCode.Count > 1)
                        {
                            Console.WriteLine(String.Join(",", stringsWithSameCode));
                        }
                    }
                }

                Console.WriteLine("MODEL {0}", i + 1);
                Console.WriteLine("# hashed entries: {0}", map.Count);
                if (map.Count == 0)
                {
                    // Nothing to aggregate: Max()/Min() throw on an empty sequence and the
                    // average would be 0/0.
                    continue;
                }

                // Mean over the hash entries of the mean accent string length within the entry.
                Console.WriteLine("Average code length: {0}",
                                  (double)map.Sum(he => (double)he.Value.Sum(acc => acc.Key.Length) / he.Value.Count) / map.Count
                                  );
                Console.WriteLine("Max code length: {0}", map.Max(he => he.Value.Max(acc => acc.Key.Length)));
                Console.WriteLine("Max # of codes per hash entry: {0}", map.Max(he => he.Value.Count));
                Console.WriteLine("# of codes: {0}", map.Sum(he => he.Value.Count));
                Console.WriteLine("# of unique strings: {0}", perModelUniqueAccentStrings.Count);
                Console.WriteLine("# of unique raw strings: {0}", perModelUniqueAccentRawStrings.Count);
                Console.WriteLine("# of unique codes: {0}", perModelUniqueAccentCodes.Count);
                Console.WriteLine("# of unique counts: {0}", perModelUniqueAccentCodeCount.Count);
                Console.WriteLine("Max count: {0}, min count: {1}",
                                  perModelUniqueAccentCodeCount.Max(), perModelUniqueAccentCodeCount.Min());
            }

            Console.WriteLine("TOTAL");
            Console.WriteLine("Total # of codes: {0}", uniqueAccentStrings.Count);
            Console.WriteLine("Total # of counts: {0}", uniqueAccentCodeCount.Count);
            if (uniqueAccentCodeCount.Count > 0)
            {
                // Guarded because Max()/Min() throw when no model contributed any count.
                Console.WriteLine("Max count: {0}, min count: {1}", uniqueAccentCodeCount.Max(), uniqueAccentCodeCount.Min());
            }
        }