示例#1
0
        /// <summary>
        /// Ad-hoc smoke test: runs the language identifier over four hard-coded snippets
        /// (Tamil only, English only, and the two mixed orders) and prints each snippet
        /// followed by the identifier's answer.
        /// </summary>
        static void MyTestMultiLngs()
        {
            // Hard-coded positional settings: [0] identifier spec, [1] gram cap,
            // [2] profile file, [3] cleaner name. Entry [4] is not read by this method.
            var args = new string[]
            {
                "Vector",
                "-1",
                "langprofiles-char-1_5-nfc-all.bin.gz",
                "none",
                //"parsed-files.txt"
                "test_lst.txt"
            };

            var li      = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
            var cleaner = Cleaning.MakeCleaner(args[3]);

            var samples = new string[]
            {
                "கர்த்தருக்குக் காத்திருக்கிறவர்களே",
                "defined in RFC2030, where it reads",
                "நீங்களெல்லாரும் திடமனதாயிருங்கள், defined in RFC2030, where it reads",
                "defined in RFC2030, where it reads, நீங்களெல்லாரும் திடமனதாயிருங்கள்"
            };

            foreach (var sample in samples)
            {
                // Each snippet is cleaned first and identified as a whole.
                Console.WriteLine($"{sample} - {li.Identify(cleaner(sample))}");
            }

            // Wait for input before returning (presumably to keep the console window open).
            Console.Read();
        }
示例#2
0
        /// <summary>
        /// Ad-hoc smoke test: identifies four hard-coded snippets (Tamil, English and the
        /// two mixed orders) and prints a label for each followed by the detected language.
        /// </summary>
        /// <param name="args">Ignored; immediately replaced by hard-coded settings.</param>
        static void MyTest(string[] args)
        {
            // Hard-coded positional settings: [0] identifier spec, [1] gram cap,
            // [2] profile file, [3] cleaner name. Entry [4] is not read by this method.
            args = new string[]
            {
                "Vector",
                "-1",
                "langprofiles-char-1_5-nfc-all.bin.gz",
                "none",
                //"parsed-files.txt"
                "test_lst.txt"
            };

            var li      = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
            var cleaner = Cleaning.MakeCleaner(args[3]);

            // The Tamil sample is identified from the full sentence but printed with a
            // shortened label (its first two words), as in the original output.
            var tamilFull    = "கர்த்தருக்குக் காத்திருக்கிறவர்களே, நீங்களெல்லாரும் திடமனதாயிருங்கள், அவர் உங்கள் இருதயத்தை ஸ்திரப்படுத்துவார்.";
            var english      = "defined in RFC2030, where it reads";
            var tamilEnglish = "நீங்களெல்லாரும் திடமனதாயிருங்கள், defined in RFC2030, where it reads";
            var englishTamil = "defined in RFC2030, where it reads, நீங்களெல்லாரும் திடமனதாயிருங்கள்";

            Console.WriteLine($"கர்த்தருக்குக் காத்திருக்கிறவர்களே {li.Identify(cleaner(tamilFull))}");
            Console.WriteLine($"{english} {li.Identify(cleaner(english))}");
            Console.WriteLine($"{tamilEnglish} {li.Identify(cleaner(tamilEnglish))}");
            // Fix: this line used to print the English-first label while identifying the
            // Tamil-first text, so the reported language did not belong to the label shown.
            Console.WriteLine($"{englishTamil} {li.Identify(cleaner(englishTamil))}");

            // Wait for input before returning (presumably to keep the console window open).
            Console.Read();
        }
        /// <summary>
        /// Walks every file under the "Data" folder of the current directory, extracts its
        /// text, runs it through <c>Cleaning.CleanWiki</c>, and writes one cleaned document
        /// per line to "&lt;langCode&gt;_&lt;inputName&gt;_parsed.txt" next to the input.
        /// </summary>
        /// <remarks>
        /// The language code is the part of the file name that precedes "wiki".
        /// Files whose full path contains "abst" are read as XML and every non-empty
        /// "abstract" element is extracted; all other files go through
        /// <c>XmlContentReader</c> until its <c>Read</c> returns null.
        /// Files without "wiki" in their name, and files ending in "_parsed.txt"
        /// (outputs of this method), are skipped.
        /// </remarks>
        public static void ProcessInline()
        {
            var dataDir = Path.Combine(Directory.GetCurrentDirectory(), "Data");

            // GetFiles takes a snapshot of the listing. Outputs are written into the same
            // tree, so a lazy enumeration could hand freshly created files back to us.
            foreach (var inFileName in Directory.GetFiles(dataDir, "*", SearchOption.AllDirectories))
            {
                // Outputs of an earlier run still contain "wiki" in their name; do not
                // treat them as input.
                if (inFileName.EndsWith("_parsed.txt", StringComparison.Ordinal))
                {
                    continue;
                }

                var baseName = Path.GetFileNameWithoutExtension(inFileName);
                var wikiPos  = baseName.IndexOf("wiki", StringComparison.Ordinal);
                if (wikiPos < 0)
                {
                    // Substring(0, -1) would throw; report and move on instead.
                    Console.Error.WriteLine("Skipping {0}: file name does not contain \"wiki\".", inFileName);
                    continue;
                }

                var langCode   = baseName.Substring(0, wikiPos);
                var outFile    = Path.Combine(Path.GetDirectoryName(inFileName), langCode + "_" + baseName + "_parsed.txt");
                var wikiformat = !inFileName.Contains("abst") ? "content" : "abstract";
                long absCnt    = 0;

                if (wikiformat == "content")
                {
                    using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
                    using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
                    {
                        for (; ;)
                        {
                            var text = rd.Read();
                            if (text == null)
                            {
                                break;
                            }
                            wr.WriteLine(Cleaning.CleanWiki(text));
                            absCnt++;
                        }
                    }
                }
                else
                {
                    using (var rd = new XmlTextReader(inFileName))
                    using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
                    {
                        while (rd.Read())
                        {
                            if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                            {
                                var text = rd.ReadElementString("abstract");
                                wr.WriteLine(Cleaning.CleanWiki(text));
                                absCnt++;
                            }
                        }
                    }
                }

                Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
            }
        }
 /// <summary>
 /// Command-line entry point: reads a list of input file names from <c>args[0]</c> and,
 /// for each one, writes the cleaned text to "&lt;langCode&gt;_parsed.txt", one document per line.
 /// </summary>
 /// <param name="args">
 /// Exactly two entries: the path of a text file listing one input file per line, and
 /// the extraction mode, either "abstract" or "content". Anything else prints the usage line.
 /// </param>
 /// <remarks>
 /// The language code is everything in the listed name that precedes "wiki" (including
 /// any directory part). Blank lines and names without "wiki" are skipped.
 /// </remarks>
 public static void ProcessCmdline(string[] args)
 {
     if (args.Length != 2 || (args[1] != "abstract" && args[1] != "content"))
     {
         Console.Error.WriteLine("Usage: ParseWikipedia xml-files.txt [abstract|content]");
         return;
     }

     foreach (var inFileName in File.ReadAllLines(args[0]))
     {
         // A trailing or stray blank line in the list must not abort the whole run.
         if (string.IsNullOrWhiteSpace(inFileName))
         {
             continue;
         }

         var wikiPos = inFileName.IndexOf("wiki", StringComparison.Ordinal);
         if (wikiPos < 0)
         {
             // Substring(0, -1) would throw; report and move on instead.
             Console.Error.WriteLine("Skipping {0}: file name does not contain \"wiki\".", inFileName);
             continue;
         }

         var  langCode = inFileName.Substring(0, wikiPos);
         var  outFile  = langCode + "_parsed.txt";
         long absCnt   = 0;

         if (args[1] == "content")
         {
             using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
             using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
             {
                 for (; ;)
                 {
                     var text = rd.Read();
                     if (text == null)
                     {
                         break;
                     }
                     wr.WriteLine(Cleaning.CleanWiki(text));
                     absCnt++;
                 }
             }
         }
         else
         {
             using (var rd = new XmlTextReader(inFileName))
             using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
             {
                 while (rd.Read())
                 {
                     if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                     {
                         var text = rd.ReadElementString("abstract");
                         wr.WriteLine(Cleaning.CleanWiki(text));
                         absCnt++;
                     }
                 }
             }
         }

         Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
     }
 }
        /// <summary>
        /// Builds a gzip-compressed language profile file from every file under the "Data"
        /// folder of the current directory, using hard-coded settings instead of a command line.
        /// </summary>
        /// <remarks>
        /// Output layout, in write order: tokenizer name, lo, hi, tlc flag, number of input
        /// files (Int32); then per file the language code, the number of retained grams
        /// (Int64) and each gram with its count (Int64). The language code is the part of the
        /// file name before the first "_"; a name without "_" makes <c>Substring</c> throw
        /// <see cref="ArgumentOutOfRangeException"/>. One tab-separated summary line per file
        /// (language, lines read, grams, total occurrences) goes to standard output.
        /// </remarks>
        public static void ProcessInline()
        {
            // Positional settings: [0] tokenizer, [1] lo, [2] hi, [3] case handling,
            // [4] max grams per language, [5] cleaner name, [6] output profile file.
            string[] args = new string[]
            {
                "char",
                //"word",

                "1",
                "5",
                "ncf",
                "-1",
                "none",
                "langprofiles-char-1_5-nfc-all.bin.gz"
            };

            var  tokenizer = Tokenization.Tokenizer(args[0]);
            int  lo        = int.Parse(args[1]);
            int  hi        = int.Parse(args[2]);
            bool tlc       = args[3] == "tlc";
            int  n         = int.Parse(args[4]);
            var  cleaner   = Cleaning.MakeCleaner(args[5]);

            // Snapshot the listing once. The file count goes into the header before the
            // loop runs, so both must see exactly the same set of files.
            string[] inFileNames = Directory.GetFiles(Path.Combine(Directory.GetCurrentDirectory(), "Data"), "*", SearchOption.AllDirectories);

            // With the seven settings above, no fold is excluded (both are -1).
            var    nfolds      = args.Length == 7 ? -1 : int.Parse(args[7]);
            var    fold        = args.Length == 7 ? -1 : int.Parse(args[8]);
            string out_profile = Path.Combine(Directory.GetCurrentDirectory(), args[6]);

            using (var bw = new BinaryWriter(new GZipStream(new FileStream(out_profile, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
            {
                bw.Write(args[0]);
                bw.Write(lo);
                bw.Write(hi);
                bw.Write(tlc);
                bw.Write(inFileNames.Length);
                foreach (var inFileName in inFileNames)
                {
                    var          baseName = Path.GetFileNameWithoutExtension(inFileName);
                    var          langCode = baseName.Substring(0, baseName.IndexOf("_"));
                    long         absCnt   = 0;
                    MemoryStream tmpFile  = new MemoryStream();

                    // Pass 1: clean every line that is not in the excluded fold and buffer it in memory.
                    using (var rd = new StreamReader(inFileName))
                    using (var wr = new StreamWriter(tmpFile))
                    {
                        for (; ;)
                        {
                            var text = rd.ReadLine();
                            if (text == null)
                            {
                                break;
                            }
                            if (fold == -1 || (absCnt % nfolds) != fold)
                            {
                                wr.WriteLine(cleaner(text));
                            }
                            absCnt++;
                        }
                    }

                    // Pass 2: tokenize the buffered text and count each gram.
                    // ToArray still works after the writer above has closed the stream.
                    using (var rd = new StreamReader(new MemoryStream(tmpFile.ToArray())))
                    {
                        var distro = new Dictionary <string, long>();
                        foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                        {
                            long seen;
                            distro.TryGetValue(tok, out seen);   // seen stays 0 for a new gram
                            distro[tok] = seen + 1;
                        }

                        // Materialise the ranking once; the deferred query used to be
                        // evaluated (and therefore sorted) twice: for the count and for the loop.
                        var orderedDistro = (n > 0
                          ? distro.OrderByDescending(x => x.Value).Take(n)
                          : distro.OrderByDescending(x => x.Value)).ToArray();

                        bw.Write(langCode);
                        bw.Write(orderedDistro.LongLength);   // must stay Int64 in the file format
                        long grams = 0;
                        long occs  = 0;
                        foreach (var kv in orderedDistro)
                        {
                            bw.Write(kv.Key);
                            bw.Write(kv.Value);
                            grams++;
                            occs += kv.Value;
                        }
                        Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);
                    }
                }
            }
        }
 /// <summary>
 /// Command-line entry point that compiles per-language gram distributions into a
 /// gzip-compressed profile file.
 /// </summary>
 /// <param name="args">
 /// Eight or ten entries: tokenizer kind, lo, hi, case handling, maxTokens, cleaner name,
 /// the list file of training files, the output file, and optionally nfolds and fold.
 /// Any other count prints the usage text.
 /// </param>
 /// <remarks>
 /// Output layout, in write order: tokenizer name, lo, hi, tlc flag, number of input
 /// files (Int32); then per file the language code, the number of retained grams
 /// (Int64) and each gram with its count (Int64). The language code is the part of each
 /// listed name before the first "_"; a name without "_" makes <c>Substring</c> throw
 /// <see cref="ArgumentOutOfRangeException"/>. Cleaned text is staged in "xml-to-txt.tmp"
 /// in the current directory, which is deleted when the method finishes.
 /// </remarks>
 public static void ProcesCmdLine(string[] args)
 {
     if (args.Length != 8 && args.Length != 10)
     {
         Console.Error.WriteLine("Usage: CompileDistro [char|word] lo hi [ncf|tlc] maxTokens [wiki|twit|none] files.txt distros.bin.gz [nfolds fold]");
         Console.Error.WriteLine("  where [char|word] indicates whether to use character- or word-based n-grams");
         Console.Error.WriteLine("        lo is the minimum gram length");
         Console.Error.WriteLine("        hi is the maximum gram length");
         Console.Error.WriteLine("        [ncf|tlc] indicates whether to do no-case-folding or to-lower-case");
         Console.Error.WriteLine("        maxTokens is the maximum number of most-frequent tokens that should be retained per language");
         Console.Error.WriteLine("        [wiki|twit|none] indicates which text cleaning scheme to use");
         Console.Error.WriteLine("        files.txt contains a list of training file names, one per language, each file name of the form xx_*,");
         Console.Error.WriteLine("           where xx is the ISO 639-1 language code");
         Console.Error.WriteLine("        distros.bin.gz is the name of the languages profile file that is to be generated");
         Console.Error.WriteLine("        nfolds (optionally) indicates the number of folds for n-fold cross validation");
         Console.Error.WriteLine("        fold (optionally) indicates the number of the fold to exclude from the profile");
         return;
     }

     const string tmpFile     = "xml-to-txt.tmp";
     var          tokenizer   = Tokenization.Tokenizer(args[0]);
     int          lo          = int.Parse(args[1]);
     int          hi          = int.Parse(args[2]);
     bool         tlc         = args[3] == "tlc";
     int          n           = int.Parse(args[4]);
     var          cleaner     = Cleaning.MakeCleaner(args[5]);
     var          inFileNames = File.ReadAllLines(args[6]);
     var          nfolds      = args.Length == 8 ? -1 : int.Parse(args[8]);
     var          fold        = args.Length == 8 ? -1 : int.Parse(args[9]);

     try
     {
         using (var bw = new BinaryWriter(new GZipStream(new FileStream(args[7], FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
         {
             bw.Write(args[0]);
             bw.Write(lo);
             bw.Write(hi);
             bw.Write(tlc);
             bw.Write(inFileNames.Length);
             foreach (var inFileName in inFileNames)
             {
                 var  langCode = inFileName.Substring(0, inFileName.IndexOf("_"));
                 long absCnt   = 0;

                 // Pass 1: clean every line that is not in the excluded fold and stage it on disk.
                 using (var rd = new StreamReader(inFileName))
                 using (var wr = new StreamWriter(tmpFile))
                 {
                     for (; ;)
                     {
                         var text = rd.ReadLine();
                         if (text == null)
                         {
                             break;
                         }
                         if (fold == -1 || (absCnt % nfolds) != fold)
                         {
                             wr.WriteLine(cleaner(text));
                         }
                         absCnt++;
                     }
                 }

                 // Pass 2: tokenize the staged text and count each gram.
                 using (var rd = new StreamReader(tmpFile))
                 {
                     var distro = new Dictionary <string, long>();
                     foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                     {
                         long seen;
                         distro.TryGetValue(tok, out seen);   // seen stays 0 for a new gram
                         distro[tok] = seen + 1;
                     }

                     // Materialise the ranking once; the deferred query used to be
                     // evaluated (and therefore sorted) twice: for the count and for the loop.
                     var orderedDistro = (n > 0
                       ? distro.OrderByDescending(x => x.Value).Take(n)
                       : distro.OrderByDescending(x => x.Value)).ToArray();

                     bw.Write(langCode);
                     bw.Write(orderedDistro.LongLength);   // must stay Int64 in the file format
                     long grams = 0;
                     long occs  = 0;
                     foreach (var kv in orderedDistro)
                     {
                         bw.Write(kv.Key);
                         bw.Write(kv.Value);
                         grams++;
                         occs += kv.Value;
                     }
                     Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);
                 }
             }
         }
     }
     finally
     {
         // The staging file was previously left behind in the working directory.
         // File.Delete does not throw when the file does not exist.
         File.Delete(tmpFile);
     }
 }
示例#7
0
        /// <summary>
        /// Runs the language identifier over the labeled test data and prints a
        /// tab-separated confusion matrix (rows: identified language, columns: labeled
        /// language) to standard output, followed by the elapsed time on standard error.
        /// </summary>
        /// <param name="args">Ignored; immediately replaced by hard-coded settings.</param>
        static void InHousrTest(string[] args)
        {
            // Hard-coded stand-in for the command line.
            args = new string[]
            {
                "Vector",
                "-1",
                //"langprofiles-char-1_5-nfc-all.bin.gz",
                "langprofiles-word-1_5-nfc-10k.bin.gz",
                "wiki",
                //"parsed-files.txt"
                "test_lst.txt"
            };

            bool argCountOk = args.Length == 5 || args.Length == 7;
            if (!argCountOk)
            {
                string[] usage =
                {
                    "Usage: TestLanguageIdentifier liSpec cap distros.bin.gz [wiki|twit|none] tests.txt [nfolds fold]",
                    "where liSpec: Vector     -- cosine similarity of gram vectors",
                    "              Likely     -- probability of last char of trigram conditioned on prefix",
                    "              LikelySp   -- probability of last char of trigram conditioned on prefix, smart prior",
                    "              LikelyAp   -- probability of last char of trigram conditioned on prefix, additive prior",
                    "              Rank,rs    -- rank corr; rs = sf|sr|kt",
                    "      cap indicates how many grams per language to load (-1 means use all)",
                    "      distros.bin.gz is the file containing language profiles procuded by CompileDistros",
                    "      tests.txt contains a list of test file names, one per language, each file name of the form xx_*,",
                    "           where xx is the ISO 639-1 language code",
                    "      nfolds (optionally) indicates the number of folds for n-fold cross validation",
                    "      fold (optionally) indicates the number of the fold to test on"
                };
                foreach (var usageLine in usage)
                {
                    Console.Error.WriteLine(usageLine);
                }
                return;
            }

            var timer      = Stopwatch.StartNew();
            var identifier = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
            var clean      = Cleaning.MakeCleaner(args[3]);

            int foldCount = -1;
            int testFold  = -1;
            if (args.Length != 5)
            {
                foldCount = int.Parse(args[5]);
                testFold  = int.Parse(args[6]);
            }

            // Counts keyed by "<identified>\t<labeled>".
            var counts    = new Dictionary <string, long>();
            var seenLangs = new HashSet <string>();
            foreach (var sample in LabeledData.Read(args[4], foldCount, testFold, false))
            {
                // The identifier runs on every sample, including unlabeled ones that are then dropped.
                var guessed = identifier.Identify(clean(sample.text));
                if (sample.lang == "")
                {
                    continue;
                }

                seenLangs.Add(guessed);
                seenLangs.Add(sample.lang);

                var  cell = guessed + "\t" + sample.lang;
                long soFar;
                counts.TryGetValue(cell, out soFar);   // soFar is 0 when the cell is new
                counts[cell] = soFar + 1;
            }

            var ordered = seenLangs.OrderBy(x => x).ToArray();

            // Header row: one column per language.
            foreach (var column in ordered)
            {
                Console.Write("\t{0}", column);
            }
            Console.WriteLine();

            // One row per language, one cell per column language.
            foreach (var row in ordered)
            {
                Console.Write(row);
                foreach (var column in ordered)
                {
                    Console.Write("\t{0}", Lookup(counts, row + "\t" + column));
                }
                Console.WriteLine();
            }

            Console.Error.WriteLine("Done. Job took {0} seconds", timer.ElapsedMilliseconds * 0.001);
            Console.ReadKey();
        }