/// <summary>
/// Command-line entry point of the TestLanguageIdentifier tool. Builds a language identifier
/// from a profile file, runs it over labeled test data and writes a table of confusion-matrix
/// cells to standard output; the elapsed time is reported on standard error.
/// </summary>
/// <param name="args">
/// Five or seven positional arguments, in the order given by the usage text below:
/// liSpec, cap, distros.bin.gz, cleaning scheme, tests.txt and, optionally, nfolds and fold.
/// </param>
/// <remarks>
/// NOTE(review): this listing appears to have lost its braces and some statements (for example
/// the statements guarded by the <c>if</c>s inside the foreach loop). Confirm the control flow
/// against the original file before relying on the comments below.
/// </remarks>
public static void Main(string[] args)
            // Only 5 arguments (no cross validation) or 7 arguments (nfolds + fold) are accepted;
            // any other count leads to the usage text on standard error.
            if (args.Length != 5 && args.Length != 7)
                Console.Error.WriteLine("Usage: TestLanguageIdentifier liSpec cap distros.bin.gz [wiki|twit|none] tests.txt [nfolds fold]");
                Console.Error.WriteLine("where liSpec: Vector     -- cosine similarity of gram vectors");
                Console.Error.WriteLine("              Likely     -- probability of last char of trigram conditioned on prefix");
                Console.Error.WriteLine("              LikelySp   -- probability of last char of trigram conditioned on prefix, smart prior");
                Console.Error.WriteLine("              LikelyAp   -- probability of last char of trigram conditioned on prefix, additive prior");
                Console.Error.WriteLine("              Rank,rs    -- rank corr; rs = sf|sr|kt");
                Console.Error.WriteLine("      cap indicates how many grams per language to load (-1 means use all)");
                Console.Error.WriteLine("      distros.bin.gz is the file containing language profiles procuded by CompileDistros");
                Console.Error.WriteLine("      tests.txt contains a list of test file names, one per language, each file name of the form xx_*,");
                Console.Error.WriteLine("           where xx is the ISO 639-1 language code");
                Console.Error.WriteLine("      nfolds (optionally) indicates the number of folds for n-fold cross validation");
                Console.Error.WriteLine("      fold (optionally) indicates the number of the fold to test on");
                // NOTE(review): presumably the statements below form the else-branch of the
                // argument check; the branch structure is not visible in this listing.
                var sw      = Stopwatch.StartNew();
                // The identifier is built from the profile file (args[2]), the liSpec (args[0])
                // and the per-language gram cap (args[1]).
                var li      = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
                // args[3] selects the text cleaning scheme (wiki, twit or none per the usage text).
                var cleaner = Cleaning.MakeCleaner(args[3]);
                // -1 is passed for both nfolds and fold when the optional arguments are absent.
                var nfolds  = args.Length == 5 ? -1 : int.Parse(args[5]);
                var fold    = args.Length == 5 ? -1 : int.Parse(args[6]);

                // Cells are keyed by "<identified language>\t<labeled language>".
                var confusionMatrix = new Dictionary <string, long>();
                // NOTE(review): presumably collects the language codes seen in the loop; the
                // statements that add to this set are not visible here.
                var set             = new HashSet <string>();
                // NOTE(review): the meaning of the trailing `false` argument is defined by
                // LabeledData.Read, which is not part of this listing.
                foreach (var pair in LabeledData.Read(args[4], nfolds, fold, false))
                    var lang = li.Identify(cleaner(pair.text)); // Calling the language identifier -- it all happens here!
                    // NOTE(review): the statement guarded by this empty-label check is missing
                    // from the listing.
                    if (pair.lang == "")
                    var key = lang + "\t" + pair.lang;
                    // First occurrence of a (identified, labeled) pair: create its cell.
                    // NOTE(review): the increment of the cell is presumably among the missing lines.
                    if (!confusionMatrix.ContainsKey(key))
                        confusionMatrix[key] = 0;
                // Sorted language codes give the table a stable column/row order.
                var langs = set.OrderBy(x => x).ToArray();
                // Header: one tab-prefixed column per language.
                for (int i = 0; i < langs.Length; i++)
                    Console.Write("\t{0}", langs[i]);
                // Body: given the key layout above, j selects the identified language and
                // i selects the labeled language of each printed cell.
                for (int j = 0; j < langs.Length; j++)
                    for (int i = 0; i < langs.Length; i++)
                        Console.Write("\t{0}", Lookup(confusionMatrix, langs[j] + "\t" + langs[i]));
                // Milliseconds are scaled to seconds for the final report.
                Console.Error.WriteLine("Done. Job took {0} seconds", sw.ElapsedMilliseconds * 0.001);
예제 #2
 /// <summary>
 /// Command-line entry point of the CompileDistro tool. For every training file listed in
 /// files.txt it derives the language code from the file name, copies the file's lines through a
 /// temporary file, tokenizes that copy into n-grams, counts the tokens and prints a per-language
 /// summary line (language code, line count, gram count, occurrence count) to standard output.
 /// </summary>
 /// <param name="args">
 /// Eight or ten positional arguments, in the order given by the usage text below:
 /// [char|word], lo, hi, [ncf|tlc], maxTokens, cleaning scheme, files.txt, distros.bin.gz and,
 /// optionally, nfolds and fold.
 /// </param>
 /// <remarks>
 /// NOTE(review): this listing appears to have lost its braces and some statements (loop exits,
 /// counter increments, the writes to the output file). Confirm the control flow against the
 /// original file before relying on the comments below.
 /// </remarks>
 public static void Main(string[] args)
     // Only 8 arguments (no cross validation) or 10 arguments (nfolds + fold) are accepted;
     // any other count leads to the usage text on standard error.
     if (args.Length != 8 && args.Length != 10)
         Console.Error.WriteLine("Usage: CompileDistro [char|word] lo hi [ncf|tlc] maxTokens [wiki|twit|none] files.txt distros.bin.gz [nfolds fold]");
         Console.Error.WriteLine("  where [char|word] indicates whether to use character- or word-based n-grams");
         Console.Error.WriteLine("        lo is the minimum gram length");
         Console.Error.WriteLine("        hi is the maximum gram length");
         Console.Error.WriteLine("        [ncf|tlc] indicates whether to do no-case-folding or to-lower-case");
         Console.Error.WriteLine("        maxTokens is the maximum number of most-frequent tokens that should be retained per language");
         Console.Error.WriteLine("        [wiki|twit|none] indicates which text cleaning scheme to use");
         Console.Error.WriteLine("        files.txt contains a list of training file names, one per language, each file name of the form xx_*,");
         Console.Error.WriteLine("           where xx is the ISO 639-1 language code");
         Console.Error.WriteLine("        distros.bin.gz is the name of the languages profile file that is to be generated");
         Console.Error.WriteLine("        nfolds (optionally) indicates the number of folds for n-fold cross validation");
         Console.Error.WriteLine("        fold (optionally) indicates the number of the fold to exclude from the profile");
         // NOTE(review): presumably the statements below form the else-branch of the argument
         // check; the branch structure is not visible in this listing.
         // Scratch file in the current directory that receives the lines of each input file.
         const string tmpFile     = "xml-to-txt.tmp";
         var          tokenizer   = Tokenization.Tokenizer(args[0]);
         int          lo          = int.Parse(args[1]);
         int          hi          = int.Parse(args[2]);
         // Anything other than the literal "tlc" disables lower-casing.
         bool         tlc         = args[3] == "tlc";
         // Maximum number of most-frequent tokens kept per language; values <= 0 keep all.
         int          n           = int.Parse(args[4]);
         var          cleaner     = Cleaning.MakeCleaner(args[5]);
         var          inFileNames = File.ReadAllLines(args[6]);
         // -1 is used for both nfolds and fold when the optional arguments are absent.
         var          nfolds      = args.Length == 8 ? -1 : int.Parse(args[8]);
         var          fold        = args.Length == 8 ? -1 : int.Parse(args[9]);
         // The profile file (args[7]) is written gzip-compressed.
         // NOTE(review): the calls that write to bw are not visible in this listing.
         using (var bw = new BinaryWriter(new GZipStream(new FileStream(args[7], FileMode.Create, FileAccess.Write), CompressionMode.Compress))) {
             foreach (var inFileName in inFileNames)
                 // Language code = file-name prefix before the first underscore. The char
                 // overload of IndexOf performs an ordinal search, unlike the culture-sensitive
                 // string overload.
                 var  langCode = inFileName.Substring(0, inFileName.IndexOf('_'));
                 // Reported as the second column of the summary line.
                 // NOTE(review): presumably counts input lines; its increment is not visible here.
                 long absCnt   = 0;
                 using (var rd = new StreamReader(inFileName)) {
                     using (var wr = new StreamWriter(tmpFile)) {
                         for (;;)
                             var text = rd.ReadLine();
                             // ReadLine returns null at end of file.
                             // NOTE(review): the loop exit guarded by this check is missing.
                             if (text == null)
                             // Without cross validation (fold == -1) every line qualifies;
                             // otherwise lines whose index falls into the excluded fold are skipped.
                             // NOTE(review): the guarded statement (presumably writing the cleaned
                             // line to wr) is missing from the listing.
                             if (fold == -1 || (absCnt % nfolds) != fold)
                 using (var rd = new StreamReader(tmpFile)) {
                     // Token -> number of occurrences.
                     var distro = new Dictionary <string, long>();
                     foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                         // First occurrence of a token starts its count at 1.
                         // NOTE(review): the increment for repeated tokens is presumably among
                         // the missing lines.
                         if (!distro.ContainsKey(tok))
                             distro[tok] = 1;
                     // Most frequent tokens first; capped at n entries when n is positive.
                     var orderedDistro = n > 0
         ? distro.OrderByDescending(x => x.Value).Take(n)
         : distro.OrderByDescending(x => x.Value);
                     long grams = 0;
                     long occs  = 0;
                     foreach (var kv in orderedDistro)
                         // NOTE(review): grams is presumably incremented here as well; only the
                         // occurrence total is visible in this listing.
                         occs += kv.Value;
                     // Per-language summary: code, absCnt, retained grams, total occurrences.
                     Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);