// Manual smoke test: identify the language of a few mixed Tamil/English
// sample strings using a hard-coded profile file and print each result.
static void MyTestMultiLngs()
{
    var args = new string[]
    {
        "Vector",
        "-1",
        "langprofiles-char-1_5-nfc-all.bin.gz",
        "none",
        "test_lst.txt"
    };
    var sw = Stopwatch.StartNew();
    var li = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
    var cleaner = Cleaning.MakeCleaner(args[3]);
    // Cross-validation parameters only apply when the extra args are present.
    var nfolds = args.Length == 5 ? -1 : int.Parse(args[5]);
    var fold = args.Length == 5 ? -1 : int.Parse(args[6]);
    // Samples: Tamil only, English only, and both mixes of the two.
    string[] samples =
    {
        "கர்த்தருக்குக் காத்திருக்கிறவர்களே",
        "defined in RFC2030, where it reads",
        "நீங்களெல்லாரும் திடமனதாயிருங்கள், defined in RFC2030, where it reads",
        "defined in RFC2030, where it reads, நீங்களெல்லாரும் திடமனதாயிருங்கள்"
    };
    foreach (var sample in samples)
    {
        Console.WriteLine($"{sample} - {li.Identify(cleaner(sample))}");
    }
    Console.Read();
}
// Manual smoke test: identifies the language of several hard-coded
// Tamil/English sample sentences and prints each labeled result.
// NOTE: the incoming 'args' parameter is deliberately overwritten with a
// fixed configuration (identifier spec, cap, profile file, cleaner, test list).
static void MyTest(string[] args)
{
    args = new string[]
    {
        "Vector",
        "-1",
        "langprofiles-char-1_5-nfc-all.bin.gz",
        "none",
        "test_lst.txt"
    };
    var sw = Stopwatch.StartNew();
    var li = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
    var cleaner = Cleaning.MakeCleaner(args[3]);
    // Cross-validation parameters only apply when the extra args are present.
    var nfolds = args.Length == 5 ? -1 : int.Parse(args[5]);
    var fold = args.Length == 5 ? -1 : int.Parse(args[6]);
    Console.WriteLine($"கர்த்தருக்குக் காத்திருக்கிறவர்களே {li.Identify(cleaner("கர்த்தருக்குக் காத்திருக்கிறவர்களே, நீங்களெல்லாரும் திடமனதாயிருங்கள், அவர் உங்கள் இருதயத்தை ஸ்திரப்படுத்துவார்."))}");
    Console.WriteLine($"defined in RFC2030, where it reads {li.Identify(cleaner("defined in RFC2030, where it reads"))}");
    Console.WriteLine($"நீங்களெல்லாரும் திடமனதாயிருங்கள், defined in RFC2030, where it reads {li.Identify(cleaner("நீங்களெல்லாரும் திடமனதாயிருங்கள், defined in RFC2030, where it reads"))}");
    // BUG FIX: this line previously re-identified the Tamil-first string from the
    // line above; it now identifies the English-first text that matches its label.
    Console.WriteLine($"defined in RFC2030, where it reads, நீங்களெல்லாரும் திடமனதாயிருங்கள் {li.Identify(cleaner("defined in RFC2030, where it reads, நீங்களெல்லாரும் திடமனதாயிருங்கள்"))}");
    Console.Read();
}
// Parses every Wikipedia dump file found under ./Data into a cleaned,
// one-document-per-line text file written next to the input file.
// Files whose name contains "abst" are treated as abstract dumps; all
// others as full content dumps.
// NOTE(review): assumes every file name contains "wiki" preceded by the
// language code (e.g. "tawiki-...") — a name without "wiki" would make
// IndexOf return -1 and Substring throw. TODO confirm inputs.
public static void ProcessInline()
{
    foreach (var inFileName in Directory.EnumerateFiles(Path.Combine(Directory.GetCurrentDirectory(), "Data"), "*", SearchOption.AllDirectories))
    {
        // Hoisted: the original computed GetFileNameWithoutExtension twice per file.
        var baseName = Path.GetFileNameWithoutExtension(inFileName);
        var langCode = baseName.Substring(0, baseName.IndexOf("wiki"));
        var outFile = Path.Combine(Path.GetDirectoryName(inFileName), langCode + "_" + baseName + "_parsed.txt");
        var wikiformat = !inFileName.Contains("abst") ? "content" : "abstract";
        long absCnt = 0; // documents written for this input
        if (wikiformat == "content")
        {
            // Full content dump: XmlContentReader yields one document's text per
            // Read(), returning null at end of input.
            using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
            {
                using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
                {
                    for (; ;)
                    {
                        var text = rd.Read();
                        if (text == null)
                        {
                            break;
                        }
                        wr.WriteLine(Cleaning.CleanWiki(text));
                        absCnt++;
                    }
                }
            }
        }
        else
        {
            // Abstract dump: emit the text of every non-empty <abstract> element.
            using (var rd = new XmlTextReader(inFileName))
            {
                using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
                {
                    while (rd.Read())
                    {
                        if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                        {
                            var text = rd.ReadElementString("abstract");
                            wr.WriteLine(Cleaning.CleanWiki(text));
                            absCnt++;
                        }
                    }
                }
            }
        }
        Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
    }
}
// Command-line front end for Wikipedia dump parsing: reads a list of dump
// file names from args[0] and writes a "<lang>_parsed.txt" file for each,
// treating every dump as an abstract or a full content dump per args[1].
public static void ProcessCmdline(string[] args)
{
    // Guard clause: bad arguments -> print usage and bail out early.
    if (args.Length != 2 || (args[1] != "abstract" && args[1] != "content"))
    {
        Console.Error.WriteLine("Usage: ParseWikipedia xml-files.txt [abstract|content]");
        return;
    }
    foreach (var dumpPath in File.ReadAllLines(args[0]))
    {
        // The prefix before "wiki" in the file name is the language code.
        var langCode = dumpPath.Substring(0, dumpPath.IndexOf("wiki"));
        var parsedPath = langCode + "_parsed.txt";
        long docCount = 0;
        if (args[1] == "content")
        {
            // Full content dump: one document's text per Read(), null at EOF.
            using (var reader = new XmlContentReader(new StreamReader(dumpPath, System.Text.Encoding.UTF8, false)))
            using (var writer = new StreamWriter(parsedPath, false, System.Text.Encoding.UTF8))
            {
                while (true)
                {
                    var text = reader.Read();
                    if (text == null)
                    {
                        break;
                    }
                    writer.WriteLine(Cleaning.CleanWiki(text));
                    docCount++;
                }
            }
        }
        else
        {
            // Abstract dump: emit the text of every non-empty <abstract> element.
            using (var reader = new XmlTextReader(dumpPath))
            using (var writer = new StreamWriter(parsedPath, false, System.Text.Encoding.UTF8))
            {
                while (reader.Read())
                {
                    if (reader.IsStartElement("abstract") && !reader.IsEmptyElement)
                    {
                        writer.WriteLine(Cleaning.CleanWiki(reader.ReadElementString("abstract")));
                        docCount++;
                    }
                }
            }
        }
        Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, docCount);
    }
}
// Builds a compressed n-gram language-profile file from the cleaned training
// files under ./Data. Each input file name must start with "<lang>_"; one
// gram distribution per language is appended to the gzip profile stream.
public static void ProcessInline()
{
    string[] args = new string[]
    {
        "char",
        //"word",
        "1",
        "5",
        "ncf",
        "-1",
        "none",
        "langprofiles-char-1_5-nfc-all.bin.gz"
    };
    var tokenizer = Tokenization.Tokenizer(args[0]);
    int lo = int.Parse(args[1]);             // minimum gram length
    int hi = int.Parse(args[2]);             // maximum gram length
    bool tlc = args[3] == "tlc";             // fold to lower case?
    int n = int.Parse(args[4]);              // max tokens per language (-1 = keep all)
    var cleaner = Cleaning.MakeCleaner(args[5]);
    // FIX: materialize the directory listing once. The original enumerated the
    // lazy sequence twice (Count() and then foreach); a concurrent change to
    // the directory could make the written language count disagree with the
    // records that follow, corrupting the binary profile format.
    var inFileNames = Directory.EnumerateFiles(Path.Combine(Directory.GetCurrentDirectory(), "Data"), "*", SearchOption.AllDirectories).ToArray();
    // Cross-validation parameters only apply when the extra args are present.
    var nfolds = args.Length == 7 ? -1 : int.Parse(args[7]);
    var fold = args.Length == 7 ? -1 : int.Parse(args[8]);
    string out_profile = Path.Combine(Directory.GetCurrentDirectory(), args[6]);
    using (var bw = new BinaryWriter(new GZipStream(new FileStream(out_profile, FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
    {
        // Profile header: tokenizer kind, gram-length range, case folding, language count.
        bw.Write(args[0]);
        bw.Write(lo);
        bw.Write(hi);
        bw.Write(tlc);
        bw.Write(inFileNames.Length);
        foreach (var inFileName in inFileNames)
        {
            var baseName = Path.GetFileNameWithoutExtension(inFileName);
            var langCode = baseName.Substring(0, baseName.IndexOf("_"));
            long absCnt = 0; // lines read from this training file
            // Stage the cleaned training text in memory before tokenizing.
            MemoryStream tmpFile = new MemoryStream();
            using (var rd = new StreamReader(inFileName))
            {
                using (var wr = new StreamWriter(tmpFile))
                {
                    for (; ;)
                    {
                        var text = rd.ReadLine();
                        if (text == null)
                        {
                            break;
                        }
                        // Skip the held-out fold when doing n-fold cross validation.
                        if (fold == -1 || (absCnt % nfolds) != fold)
                        {
                            wr.WriteLine(cleaner(text));
                        }
                        absCnt++;
                    }
                }
            }
            // MemoryStream.ToArray is documented to work even after the stream
            // was closed by the StreamWriter's Dispose above.
            using (var rd = new StreamReader(new MemoryStream(tmpFile.ToArray())))
            {
                var distro = new Dictionary<string, long>();
                foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                {
                    // FIX: single-lookup increment (original did ContainsKey + indexer).
                    long cnt;
                    distro.TryGetValue(tok, out cnt);
                    distro[tok] = cnt + 1;
                }
                // FIX: materialize the ordered distribution; the deferred LINQ
                // query previously re-sorted for LongCount() and again for the loop.
                var orderedDistro = (n > 0
                    ? distro.OrderByDescending(x => x.Value).Take(n)
                    : distro.OrderByDescending(x => x.Value)).ToList();
                bw.Write(langCode);
                bw.Write(orderedDistro.LongCount());
                long grams = 0;
                long occs = 0;
                foreach (var kv in orderedDistro)
                {
                    bw.Write(kv.Key);
                    bw.Write(kv.Value);
                    grams++;
                    occs += kv.Value;
                }
                // Per-language summary: code, lines read, distinct grams, total occurrences.
                Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);
            }
        }
    }
}
// Command-line profile builder: compiles per-language n-gram distributions
// from a list of training files (args[6]) and writes them to a gzip profile
// file (args[7]). Optional [nfolds fold] args hold out one fold for
// cross validation. (Method name typo "Proces" kept for caller compatibility.)
public static void ProcesCmdLine(string[] args)
{
    if (args.Length != 8 && args.Length != 10)
    {
        Console.Error.WriteLine("Usage: CompileDistro [char|word] lo hi [ncf|tlc] maxTokens [wiki|twit|none] files.txt distros.bin.gz [nfolds fold]");
        Console.Error.WriteLine(" where [char|word] indicates whether to use character- or word-based n-grams");
        Console.Error.WriteLine(" lo is the minimum gram length");
        // FIX: this line previously described "lo" twice; it documents hi.
        Console.Error.WriteLine(" hi is the maximum gram length");
        // FIX: corrected "incicate"/"incicates" typos and "file.txt" -> "files.txt" below.
        Console.Error.WriteLine(" [ncf|tlc] indicates whether to do no-case-folding or to-lower-case");
        Console.Error.WriteLine(" maxTokens is the maximum number of most-frequent tokens that should be retained per language");
        Console.Error.WriteLine(" [wiki|twit|none] indicates which text cleaning scheme to use");
        Console.Error.WriteLine(" files.txt contains a list of training file names, one per language, each file name of the form xx_*,");
        Console.Error.WriteLine(" where xx is the ISO 639-1 language code");
        Console.Error.WriteLine(" distros.bin.gz is the name of the languages profile file that is to be generated");
        Console.Error.WriteLine(" nfolds (optionally) indicates the number of folds for n-fold cross validation");
        Console.Error.WriteLine(" fold (optionally) indicates the number of the fold to exclude from the profile");
    }
    else
    {
        // Scratch file for the cleaned training text (overwritten per language).
        const string tmpFile = "xml-to-txt.tmp";
        var tokenizer = Tokenization.Tokenizer(args[0]);
        int lo = int.Parse(args[1]);         // minimum gram length
        int hi = int.Parse(args[2]);         // maximum gram length
        bool tlc = args[3] == "tlc";         // fold to lower case?
        int n = int.Parse(args[4]);          // max tokens per language (-1 = keep all)
        var cleaner = Cleaning.MakeCleaner(args[5]);
        var inFileNames = File.ReadAllLines(args[6]);
        var nfolds = args.Length == 8 ? -1 : int.Parse(args[8]);
        var fold = args.Length == 8 ? -1 : int.Parse(args[9]);
        using (var bw = new BinaryWriter(new GZipStream(new FileStream(args[7], FileMode.Create, FileAccess.Write), CompressionMode.Compress)))
        {
            // Profile header: tokenizer kind, gram-length range, case folding, language count.
            bw.Write(args[0]);
            bw.Write(lo);
            bw.Write(hi);
            bw.Write(tlc);
            bw.Write(inFileNames.Length);
            foreach (var inFileName in inFileNames)
            {
                var langCode = inFileName.Substring(0, inFileName.IndexOf("_"));
                long absCnt = 0; // lines read from this training file
                using (var rd = new StreamReader(inFileName))
                {
                    using (var wr = new StreamWriter(tmpFile))
                    {
                        for (; ;)
                        {
                            var text = rd.ReadLine();
                            if (text == null)
                            {
                                break;
                            }
                            // Skip the held-out fold when doing n-fold cross validation.
                            if (fold == -1 || (absCnt % nfolds) != fold)
                            {
                                wr.WriteLine(cleaner(text));
                            }
                            absCnt++;
                        }
                    }
                }
                using (var rd = new StreamReader(tmpFile))
                {
                    var distro = new Dictionary<string, long>();
                    foreach (var tok in tokenizer(EnumFromRd(rd), tlc, lo, hi))
                    {
                        // FIX: single-lookup increment (original did ContainsKey + indexer).
                        long cnt;
                        distro.TryGetValue(tok, out cnt);
                        distro[tok] = cnt + 1;
                    }
                    // FIX: materialize the ordered distribution; the deferred LINQ
                    // query previously re-sorted for LongCount() and again for the loop.
                    var orderedDistro = (n > 0
                        ? distro.OrderByDescending(x => x.Value).Take(n)
                        : distro.OrderByDescending(x => x.Value)).ToList();
                    bw.Write(langCode);
                    bw.Write(orderedDistro.LongCount());
                    long grams = 0;
                    long occs = 0;
                    foreach (var kv in orderedDistro)
                    {
                        bw.Write(kv.Key);
                        bw.Write(kv.Value);
                        grams++;
                        occs += kv.Value;
                    }
                    // Per-language summary: code, lines read, distinct grams, total occurrences.
                    Console.WriteLine("{0}\t{1}\t{2}\t{3}", langCode, absCnt, grams, occs);
                }
            }
        }
    }
}
// In-house evaluation harness: runs the language identifier over a labeled
// test set and prints a confusion matrix (identified language vs. true
// language), one tab-separated row per identified language.
// NOTE: the incoming 'args' are deliberately overwritten with a fixed
// configuration, so the usage branch below is effectively dead while the
// hard-coded array stays in place. (Method name typo kept for callers.)
static void InHousrTest(string[] args)
{
    args = new string[]
    {
        "Vector",
        "-1",
        //"langprofiles-char-1_5-nfc-all.bin.gz",
        "langprofiles-word-1_5-nfc-10k.bin.gz",
        "wiki",
        "test_lst.txt"
    };
    if (args.Length != 5 && args.Length != 7)
    {
        Console.Error.WriteLine("Usage: TestLanguageIdentifier liSpec cap distros.bin.gz [wiki|twit|none] tests.txt [nfolds fold]");
        Console.Error.WriteLine("where liSpec: Vector -- cosine similarity of gram vectors");
        Console.Error.WriteLine(" Likely -- probability of last char of trigram conditioned on prefix");
        Console.Error.WriteLine(" LikelySp -- probability of last char of trigram conditioned on prefix, smart prior");
        Console.Error.WriteLine(" LikelyAp -- probability of last char of trigram conditioned on prefix, additive prior");
        Console.Error.WriteLine(" Rank,rs -- rank corr; rs = sf|sr|kt");
        Console.Error.WriteLine(" cap indicates how many grams per language to load (-1 means use all)");
        // FIX: corrected "procuded" typo in the user-facing usage text.
        Console.Error.WriteLine(" distros.bin.gz is the file containing language profiles produced by CompileDistros");
        Console.Error.WriteLine(" tests.txt contains a list of test file names, one per language, each file name of the form xx_*,");
        Console.Error.WriteLine(" where xx is the ISO 639-1 language code");
        Console.Error.WriteLine(" nfolds (optionally) indicates the number of folds for n-fold cross validation");
        Console.Error.WriteLine(" fold (optionally) indicates the number of the fold to test on");
    }
    else
    {
        var sw = Stopwatch.StartNew();
        var li = LanguageIdentifier.New(args[2], args[0], int.Parse(args[1]));
        var cleaner = Cleaning.MakeCleaner(args[3]);
        // Cross-validation parameters only apply when the extra args are present.
        var nfolds = args.Length == 5 ? -1 : int.Parse(args[5]);
        var fold = args.Length == 5 ? -1 : int.Parse(args[6]);
        // Key "identified\ttrue" -> number of test documents with that outcome.
        var confusionMatrix = new Dictionary<string, long>();
        var set = new HashSet<string>(); // every language seen (identified or labeled)
        foreach (var pair in LabeledData.Read(args[4], nfolds, fold, false))
        {
            var lang = li.Identify(cleaner(pair.text)); // Calling the language identifier -- it all happens here!
            if (pair.lang == "")
            {
                continue; // unlabeled example: nothing to score against
            }
            var key = lang + "\t" + pair.lang;
            set.Add(lang);
            set.Add(pair.lang);
            if (!confusionMatrix.ContainsKey(key))
            {
                confusionMatrix[key] = 0;
            }
            confusionMatrix[key]++;
        }
        var langs = set.OrderBy(x => x).ToArray();
        // Header row: one column per language code.
        for (int i = 0; i < langs.Length; i++)
        {
            Console.Write("\t{0}", langs[i]);
        }
        Console.WriteLine();
        // Matrix rows: identified language j against each true language i.
        for (int j = 0; j < langs.Length; j++)
        {
            Console.Write(langs[j]);
            for (int i = 0; i < langs.Length; i++)
            {
                Console.Write("\t{0}", Lookup(confusionMatrix, langs[j] + "\t" + langs[i]));
            }
            Console.WriteLine();
        }
        Console.Error.WriteLine("Done. Job took {0} seconds", sw.ElapsedMilliseconds * 0.001);
        Console.ReadKey();
    }
}