/// <summary>
/// Command-line entry point that runs a language identifier over labeled test
/// data and prints a tab-separated confusion matrix to standard output.
/// </summary>
/// <param name="args">
/// Either 5 or 7 arguments: liSpec, cap, distros.bin.gz, cleaning scheme,
/// tests.txt, and optionally nfolds and fold. Any other argument count prints
/// the usage text to standard error and sets a non-zero process exit code.
/// </param>
/// <remarks>
/// Each matrix row is a language returned by the identifier and each column is
/// a language label from the test data; a cell holds the number of samples with
/// that (identified, labeled) combination. Elapsed wall-clock time is reported
/// on standard error when the run completes.
/// </remarks>
public static void Main(string[] args)
{
    if (args.Length != 5 && args.Length != 7)
    {
        Console.Error.WriteLine("Usage: TestLanguageIdentifier liSpec cap distros.bin.gz [wiki|twit|none] tests.txt [nfolds fold]");
        Console.Error.WriteLine("where liSpec: Vector -- cosine similarity of gram vectors");
        Console.Error.WriteLine(" Likely -- probability of last char of trigram conditioned on prefix");
        Console.Error.WriteLine(" LikelySp -- probability of last char of trigram conditioned on prefix, smart prior");
        Console.Error.WriteLine(" LikelyAp -- probability of last char of trigram conditioned on prefix, additive prior");
        Console.Error.WriteLine(" Rank,rs -- rank corr; rs = sf|sr|kt");
        Console.Error.WriteLine(" cap indicates how many grams per language to load (-1 means use all)");
        Console.Error.WriteLine(" distros.bin.gz is the file containing language profiles produced by CompileDistros");
        Console.Error.WriteLine(" tests.txt contains a list of test file names, one per language, each file name of the form xx_*,");
        Console.Error.WriteLine(" where xx is the ISO 639-1 language code");
        Console.Error.WriteLine(" nfolds (optionally) indicates the number of folds for n-fold cross validation");
        Console.Error.WriteLine(" fold (optionally) indicates the number of the fold to test on");

        // Main returns void, so signal the usage error through the process exit code.
        Environment.ExitCode = 1;
        return;
    }

    var sw = Stopwatch.StartNew();
    var li = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
    var cleaner = Cleaning.MakeCleaner(args[3]);

    // -1 for both values means "no cross validation" (the 5-argument form).
    var nfolds = args.Length == 5 ? -1 : int.Parse(args[5]);
    var fold = args.Length == 5 ? -1 : int.Parse(args[6]);

    // Keyed by "<identified language>\t<labeled language>".
    var confusionMatrix = new Dictionary<string, long>();
    var set = new HashSet<string>();

    foreach (var pair in LabeledData.Read(args[4], nfolds, fold, false))
    {
        // Calling the language identifier -- it all happens here!
        // NOTE(review): the identifier is deliberately still invoked for unlabeled
        // samples, exactly as before; if Identify is confirmed to be side-effect
        // free, the skip below could move above this call to save work.
        var lang = li.Identify(cleaner(pair.text));
        if (pair.lang == "")
        {
            continue;
        }

        var key = lang + "\t" + pair.lang;
        set.Add(lang);
        set.Add(pair.lang);

        // Single lookup: count stays 0 when the key has not been seen yet.
        long count;
        confusionMatrix.TryGetValue(key, out count);
        confusionMatrix[key] = count + 1;
    }

    // Ordinal ordering keeps the row/column order independent of the machine's culture.
    var langs = set.OrderBy(x => x, StringComparer.Ordinal).ToArray();

    // Header row: one column per language.
    for (int i = 0; i < langs.Length; i++)
    {
        Console.Write("\t{0}", langs[i]);
    }
    Console.WriteLine();

    // One row per identified language, one cell per labeled language.
    for (int j = 0; j < langs.Length; j++)
    {
        Console.Write(langs[j]);
        for (int i = 0; i < langs.Length; i++)
        {
            Console.Write("\t{0}", Lookup(confusionMatrix, langs[j] + "\t" + langs[i]));
        }
        Console.WriteLine();
    }

    Console.Error.WriteLine("Done. Job took {0} seconds", sw.ElapsedMilliseconds * 0.001);
}
/// <summary>
/// Command-line entry point that builds per-language n-gram frequency profiles
/// from a list of training files and writes them to a gzip-compressed binary file.
/// </summary>
/// <param name="args">
/// Either 8 or 10 arguments: [char|word], lo, hi, [ncf|tlc], maxTokens, cleaning
/// scheme, files.txt, distros.bin.gz, and optionally nfolds and fold. Any other
/// argument count prints the usage text to standard error and sets a non-zero
/// process exit code.
/// </param>
/// <exception cref="FormatException">
/// A training file name listed in files.txt contains no '_' separator, so no
/// language code can be taken from it.
/// </exception>
/// <remarks>
/// Output layout: tokenizer name, lo, hi, the to-lower-case flag and the number of
/// languages, followed for each language by its code, the number of retained grams
/// (as a 64-bit integer) and then each gram with its count in descending-count order.
/// A summary line (language, lines read, grams kept, total occurrences) is printed
/// to standard output per language.
/// </remarks>
public static void Main(string[] args)
{
    if (args.Length != 8 && args.Length != 10)
    {
        Console.Error.WriteLine("Usage: CompileDistro [char|word] lo hi [ncf|tlc] maxTokens [wiki|twit|none] files.txt distros.bin.gz [nfolds fold]");
        Console.Error.WriteLine(" where [char|word] indicates whether to use character- or word-based n-grams");
        Console.Error.WriteLine(" lo is the minimum gram length");
        Console.Error.WriteLine(" hi is the maximum gram length");
        Console.Error.WriteLine(" [ncf|tlc] indicates whether to do no-case-folding or to-lower-case");
        Console.Error.WriteLine(" maxTokens is the maximum number of most-frequent tokens that should be retained per language");
        Console.Error.WriteLine(" [wiki|twit|none] indicates which text cleaning scheme to use");
        Console.Error.WriteLine(" files.txt contains a list of training file names, one per language, each file name of the form xx_*,");
        Console.Error.WriteLine(" where xx is the ISO 639-1 language code");
        Console.Error.WriteLine(" distros.bin.gz is the name of the languages profile file that is to be generated");
        Console.Error.WriteLine(" nfolds (optionally) indicates the number of folds for n-fold cross validation");
        Console.Error.WriteLine(" fold (optionally) indicates the number of the fold to exclude from the profile");

        // Main returns void, so signal the usage error through the process exit code.
        Environment.ExitCode = 1;
        return;
    }

    const string tmpFile = "xml-to-txt.tmp";
    var tokenizer = Tokenization.Tokenizer(args[0]);
    int lo = int.Parse(args[1]);
    int hi = int.Parse(args[2]);
    bool tlc = args[3] == "tlc";
    int n = int.Parse(args[4]);
    var cleaner = Cleaning.MakeCleaner(args[5]);
    var inFileNames = File.ReadAllLines(args[6]);

    // -1 for both values means "no cross validation" (the 8-argument form).
    var nfolds = args.Length == 8 ? -1 : int.Parse(args[8]);
    var fold = args.Length == 8 ? -1 : int.Parse(args[9]);

    // Derive every language code before the output file is created, so that a
    // malformed file name fails fast with a clear message instead of leaving a
    // truncated profile file behind.
    var langCodes = new string[inFileNames.Length];
    for (int i = 0; i < inFileNames.Length; i++)
    {
        int sep = inFileNames[i].IndexOf('_');
        if (sep < 0)
        {
            throw new FormatException("Training file name '" + inFileNames[i] + "' is not of the form xx_* (no '_' found)");
        }
        langCodes[i] = inFileNames[i].Substring(0, sep);
    }

    try
    {
        using (var bw = new BinaryWriter(new GZipStream(new FileStream(args[7], FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
        {
            // File header.
            bw.Write(args[0]);
            bw.Write(lo);
            bw.Write(hi);
            bw.Write(tlc);
            bw.Write(inFileNames.Length);

            for (int f = 0; f < inFileNames.Length; f++)
            {
                var langCode = langCodes[f];
                long absCnt = 0;

                // Pass 1: clean the training text line by line into the temp file,
                // leaving out the lines that belong to the excluded fold.
                using (var rd = new StreamReader(inFileNames[f]))
                using (var wr = new StreamWriter(tmpFile))
                {
                    string text;
                    while ((text = rd.ReadLine()) != null)
                    {
                        if (fold == -1 || (absCnt % nfolds) != fold)
                        {
                            wr.WriteLine(cleaner(text));
                        }
                        absCnt++;  // counts every input line, excluded ones included
                    }
                }

                // Pass 2: tokenize the cleaned text and count each gram.
                using (var rd = new StreamReader(tmpFile))
                {
                    var distro = new Dictionary<string, long>();
                    foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                    {
                        // Single lookup: seen stays 0 for a gram not encountered before.
                        long seen;
                        distro.TryGetValue(tok, out seen);
                        distro[tok] = seen + 1;
                    }

                    // Sort once and materialize; the result is needed both for the
                    // count written up front and for the entries that follow it.
                    IEnumerable<KeyValuePair<string, long>> ranked = distro.OrderByDescending(x => x.Value);
                    if (n > 0)
                    {
                        ranked = ranked.Take(n);
                    }
                    var kept = ranked.ToArray();

                    bw.Write(langCode);
                    bw.Write(kept.LongLength);  // 64-bit count, same on-disk width as before

                    long grams = 0;
                    long occs = 0;
                    foreach (var kv in kept)
                    {
                        bw.Write(kv.Key);
                        bw.Write(kv.Value);
                        grams++;
                        occs += kv.Value;
                    }
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);
                }
            }
        }
    }
    finally
    {
        // The temp file is scratch space only; do not leave it in the working directory.
        File.Delete(tmpFile);
    }
}