Esempio n. 1
0
 /// <summary>
 /// Splits <paramref name="text"/> into words and returns the distinct results of
 /// running the shared identifier (<c>_li</c>) over each word, joined with commas.
 /// </summary>
 /// <param name="text">Text to analyse; <c>null</c> is treated as an empty string.</param>
 /// <returns>
 /// Comma-separated distinct identification results, in order of first appearance;
 /// an empty string when the text contains no words.
 /// </returns>
 public string Check(string text)
 {
     var clean = Cleaning.MakeCleaner("none");

     // Sentences are delimited by '.', words inside a sentence by ' '; empty pieces are dropped.
     List<string> words = (text ?? "")
         .Split('.', StringSplitOptions.RemoveEmptyEntries)
         .SelectMany(sentence => sentence.Split(' ', StringSplitOptions.RemoveEmptyEntries))
         .ToList();

     var detected = words
         .Select(word => _li.Identify(clean(word)))
         .Distinct();

     return string.Join(',', detected);
 }
Esempio n. 2
0
 /// <summary>
 /// Reads the <c>text</c> form field of the current request, splits it into words and
 /// counts how many words produced each identification result from the shared
 /// identifier (<c>_li</c>).
 /// </summary>
 /// <returns>
 /// One <see cref="Models.LangModel"/> per distinct identification result, carrying the
 /// result in <c>lang</c> and the number of words that produced it in <c>count</c>.
 /// </returns>
 public List<Models.LangModel> CheckPost()
 {
     string text = HttpContext.Request.Form["text"];
     var clean = Cleaning.MakeCleaner("none");

     // Sentences are delimited by '.', words inside a sentence by ' '; empty pieces are dropped.
     List<string> words = (text ?? "")
         .Split('.', StringSplitOptions.RemoveEmptyEntries)
         .SelectMany(sentence => sentence.Split(' ', StringSplitOptions.RemoveEmptyEntries))
         .ToList();

     var byLanguage = words
         .Select(word => _li.Identify(clean(word)))
         .GroupBy(language => language);

     return byLanguage
         .Select(group => new Models.LangModel() { lang = group.Key, count = group.Count() })
         .ToList();
 }
Esempio n. 3
0
 /// <summary>
 /// Splits <paramref name="text"/> into words and identifies each one with the profile
 /// stored under <c>Content/UserData/{owner}/{lang_prof}/</c> in the web root. The profile
 /// file name is the part of <paramref name="lang_prof"/> before its last underscore,
 /// followed by <c>.bin.gz</c>.
 /// </summary>
 /// <param name="owner">Folder name of the profile owner below <c>Content/UserData</c>.</param>
 /// <param name="lang_prof">Profile folder name; must contain at least one underscore.</param>
 /// <param name="text">Text to analyse; <c>null</c> is treated as an empty string.</param>
 /// <returns>
 /// Comma-separated distinct identification results, in order of first appearance;
 /// an empty string when the text contains no words.
 /// </returns>
 /// <exception cref="ArgumentNullException"><paramref name="lang_prof"/> is <c>null</c>.</exception>
 /// <exception cref="ArgumentOutOfRangeException"><paramref name="lang_prof"/> contains no underscore.</exception>
 /// <exception cref="ArgumentException">The resulting path would leave the user data folder.</exception>
 public string ProfileCheck(string owner, string lang_prof, string text)
 {
     if (lang_prof == null)
     {
         throw new ArgumentNullException(nameof(lang_prof));
     }

     int lastUnderscore = lang_prof.LastIndexOf('_');
     if (lastUnderscore < 0)
     {
         // Same exception type Substring(0, -1) used to raise, but with a usable message.
         throw new ArgumentOutOfRangeException(nameof(lang_prof), lang_prof, "Profile name must contain an underscore.");
     }

     var userDataRoot = System.IO.Path.Combine(_env.WebRootPath, "Content", "UserData");
     var path = System.IO.Path.Combine(userDataRoot, owner, lang_prof, lang_prof.Substring(0, lastUnderscore) + ".bin.gz");

     // owner and lang_prof are caller-supplied: a rooted segment makes Path.Combine discard
     // everything before it and ".." climbs out of the folder, so verify containment.
     var rootPrefix = System.IO.Path.GetFullPath(userDataRoot) + System.IO.Path.DirectorySeparatorChar;
     if (!System.IO.Path.GetFullPath(path).StartsWith(rootPrefix, StringComparison.Ordinal))
     {
         throw new ArgumentException("The profile path must stay inside the user data folder.");
     }

     var lip = LanguageIdentifier.New(path, "Vector", -1);
     var cleaner = Cleaning.MakeCleaner("none");

     // Sentences are delimited by '.', words inside a sentence by ' '; empty pieces are dropped.
     var wrds = (text ?? "")
         .Split('.', StringSplitOptions.RemoveEmptyEntries)
         .SelectMany(t => t.Split(' ', StringSplitOptions.RemoveEmptyEntries))
         .ToList();

     return string.Join(',', wrds.Select(t => lip.Identify(cleaner(t))).Distinct());
 }
Esempio n. 4
0
 /// <summary>
 /// Reads the <c>owner</c>, <c>lang_prof</c> and <c>text</c> form fields of the current
 /// request, loads the profile stored under <c>Content/UserData/{owner}/{lang_prof}/</c>
 /// in the web root and counts how many words of the text produced each identification
 /// result. The profile file name is the part of <c>lang_prof</c> before its last
 /// underscore, followed by <c>.bin.gz</c>.
 /// </summary>
 /// <returns>
 /// One <see cref="Models.LangModel"/> per distinct identification result, carrying the
 /// result in <c>lang</c> and the number of words that produced it in <c>count</c>.
 /// </returns>
 /// <exception cref="ArgumentNullException">The <c>lang_prof</c> form field is missing.</exception>
 /// <exception cref="ArgumentOutOfRangeException">The <c>lang_prof</c> form field contains no underscore.</exception>
 /// <exception cref="ArgumentException">The resulting path would leave the user data folder.</exception>
 public List<Models.LangModel> ProfilePost()
 {
     string owner = HttpContext.Request.Form["owner"];
     string lang_prof = HttpContext.Request.Form["lang_prof"];

     // A missing form field converts to a null string.
     if (lang_prof == null)
     {
         throw new ArgumentNullException(nameof(lang_prof), "The 'lang_prof' form field is required.");
     }

     int lastUnderscore = lang_prof.LastIndexOf('_');
     if (lastUnderscore < 0)
     {
         // Same exception type Substring(0, -1) used to raise, but with a usable message.
         throw new ArgumentOutOfRangeException(nameof(lang_prof), lang_prof, "Profile name must contain an underscore.");
     }

     var userDataRoot = System.IO.Path.Combine(_env.WebRootPath, "Content", "UserData");
     var path = System.IO.Path.Combine(userDataRoot, owner, lang_prof, lang_prof.Substring(0, lastUnderscore) + ".bin.gz");

     // Form values are untrusted: a rooted segment makes Path.Combine discard everything
     // before it and ".." climbs out of the folder, so verify containment.
     var rootPrefix = System.IO.Path.GetFullPath(userDataRoot) + System.IO.Path.DirectorySeparatorChar;
     if (!System.IO.Path.GetFullPath(path).StartsWith(rootPrefix, StringComparison.Ordinal))
     {
         throw new ArgumentException("The profile path must stay inside the user data folder.");
     }

     var lip = LanguageIdentifier.New(path, "Vector", -1);
     string text = HttpContext.Request.Form["text"];
     var cleaner = Cleaning.MakeCleaner("none");

     // Sentences are delimited by '.', words inside a sentence by ' '; empty pieces are dropped.
     var wrds = (text ?? "")
         .Split('.', StringSplitOptions.RemoveEmptyEntries)
         .SelectMany(t => t.Split(' ', StringSplitOptions.RemoveEmptyEntries))
         .ToList();

     return wrds
         .Select(t => lip.Identify(cleaner(t)))
         .GroupBy(t => t)
         .Select(t1 => new Models.LangModel() { lang = t1.Key, count = t1.Count() })
         .ToList();
 }
Esempio n. 5
0
        /// <summary>
        /// Builds a token-frequency profile from the labelled training files of
        /// <paramref name="profile"/> and writes it, gzip-compressed, to
        /// <c>profile.ProfileFilePath</c> (an existing file is overwritten).
        /// </summary>
        /// <remarks>
        /// Output layout, written with <see cref="BinaryWriter"/>: <c>PType</c>, min gram (Int32),
        /// max gram (Int32), the <c>tlc</c> flag (Boolean), number of files (Int32); then for each
        /// file its label, the number of entries (Int64) and every token with its count (Int64),
        /// ordered by descending count. A per-file summary line is also written to the console.
        /// </remarks>
        /// <param name="profile">
        /// Training settings: tokenizer type, gram range, case handling, input files and output path.
        /// </param>
        public static void Train(Models.LangProfile profile)
        {
            var  tokenizer = Tokenization.Tokenizer(profile.PType);
            int  lo        = int.Parse(profile.MinGram);
            int  hi        = int.Parse(profile.MaxGram);
            bool tlc       = profile.CaseSensitive == "tlc";
            var  cleaner   = Cleaning.MakeCleaner("none");

            // Switches that are currently fixed to "off":
            //   n      - keep only the n most frequent tokens per file when positive
            //   fold   - when not -1, skip every line whose index modulo nfolds equals fold
            int n      = -1;
            int nfolds = -1;
            int fold   = -1;

            string out_profile = profile.ProfileFilePath;

            using (var bw = new BinaryWriter(new GZipStream(new FileStream(out_profile, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            {
                bw.Write(profile.PType);
                bw.Write(lo);
                bw.Write(hi);
                bw.Write(tlc);
                bw.Write(profile.Files.Count());
                foreach (var eafile in profile.Files)
                {
                    var  langCode = eafile.Label;
                    long absCnt   = 0;

                    // Pass 1: copy the cleaned lines of the training file into memory.
                    MemoryStream tmpFile = new MemoryStream();
                    using (var rd = new StreamReader(eafile.FilePath))
                    using (var wr = new StreamWriter(tmpFile))
                    {
                        string text;
                        while ((text = rd.ReadLine()) != null)
                        {
                            if (fold == -1 || (absCnt % nfolds) != fold)
                            {
                                wr.WriteLine(cleaner(text));
                            }
                            absCnt++;
                        }
                    }

                    // Disposing the writer also closed tmpFile; MemoryStream.ToArray is
                    // documented to keep working on a closed stream.
                    using (var rd = new StreamReader(new MemoryStream(tmpFile.ToArray())))
                    {
                        // Pass 2: count how often each token occurs.
                        var distro = new Dictionary<string, long>();
                        foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                        {
                            // A missing key leaves seen at 0, so the first occurrence stores 1.
                            distro.TryGetValue(tok, out long seen);
                            distro[tok] = seen + 1;
                        }

                        // Materialise once: the deferred query was previously enumerated (and
                        // therefore sorted) twice, once for the count and once for the entries.
                        var orderedDistro = (n > 0
                            ? distro.OrderByDescending(x => x.Value).Take(n)
                            : distro.OrderByDescending(x => x.Value)).ToList();

                        bw.Write(langCode);
                        // The entry count is stored as Int64; keep the width for file compatibility.
                        bw.Write(orderedDistro.LongCount());

                        long grams = 0;
                        long occs  = 0;
                        foreach (var kv in orderedDistro)
                        {
                            bw.Write(kv.Key);
                            bw.Write(kv.Value);
                            grams++;
                            occs += kv.Value;
                        }
                        Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);
                    }
                }
            }
        }