/// <summary>
/// Parses every Wikipedia dump file found under the "Data" directory of the
/// current working directory (recursively) and writes one cleaned document
/// per line to a sibling file named "{lang}_{inputName}_parsed.txt".
/// </summary>
/// <remarks>
/// A file is treated as an abstract dump when its path contains "abst";
/// otherwise it is treated as a content dump. The language code is the part
/// of the file name (without extension) that precedes the first "wiki".
/// Files whose name has no "wiki" marker, and files that are themselves
/// output of this method, are skipped.
/// </remarks>
public static void ProcessInline()
{
    const string parsedSuffix = "_parsed.txt";
    var dataDir = Path.Combine(Directory.GetCurrentDirectory(), "Data");

    // Take a snapshot of the listing. The loop creates new files inside the
    // tree being listed, and a lazy enumeration could otherwise hand those
    // freshly written outputs back to us as inputs.
    var inFileNames = Directory.GetFiles(dataDir, "*", SearchOption.AllDirectories);

    foreach (var inFileName in inFileNames)
    {
        // Outputs left behind by an earlier run are not XML dumps.
        if (inFileName.EndsWith(parsedSuffix, StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        var baseName = Path.GetFileNameWithoutExtension(inFileName);
        var wikiIdx = baseName.IndexOf("wiki", StringComparison.Ordinal);
        if (wikiIdx < 0)
        {
            // Without the marker there is no language code; Substring(0, -1) would throw.
            Console.Error.WriteLine("Skipping {0}: file name does not contain \"wiki\".", inFileName);
            continue;
        }

        var langCode = baseName.Substring(0, wikiIdx);
        var outFile = Path.Combine(Path.GetDirectoryName(inFileName), langCode + "_" + baseName + parsedSuffix);
        var isAbstract = inFileName.Contains("abst");
        long absCnt = 0;

        if (!isAbstract)
        {
            using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
            using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
            {
                // Read() returning null marks the end of the dump.
                string text;
                while ((text = rd.Read()) != null)
                {
                    wr.WriteLine(Cleaning.CleanWiki(text));
                    absCnt++;
                }
            }
        }
        else
        {
            using (var rd = new XmlTextReader(inFileName))
            using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
            {
                // NOTE(review): ReadElementString leaves the reader on the node that
                // follows the end tag, and the next Read() steps past that node. Two
                // directly adjacent abstract elements would lose the second one -
                // confirm the dumps always have something in between.
                while (rd.Read())
                {
                    if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                    {
                        var text = rd.ReadElementString("abstract");
                        wr.WriteLine(Cleaning.CleanWiki(text));
                        absCnt++;
                    }
                }
            }
        }

        Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
    }
}
/// <summary>
/// Parses the Wikipedia dump files listed in a text file and writes one
/// cleaned document per line to "{lang}_parsed.txt" for each input.
/// </summary>
/// <param name="args">
/// Exactly two values: the path of a text file holding one dump file name
/// per line, and the dump format, either "abstract" or "content". Anything
/// else (including null) prints the usage message and returns.
/// </param>
/// <remarks>
/// The language code is everything in a listed name that precedes the first
/// "wiki"; it is used verbatim as the prefix of the output file name. Blank
/// lines and entries without a "wiki" marker are skipped.
/// </remarks>
public static void ProcessCmdline(string[] args)
{
    if (args == null || args.Length != 2 || (args[1] != "abstract" && args[1] != "content"))
    {
        Console.Error.WriteLine("Usage: ParseWikipedia xml-files.txt [abstract|content]");
        return;
    }

    var isContent = args[1] == "content";

    foreach (var inFileName in File.ReadAllLines(args[0]))
    {
        // A stray blank line (typically the last one) is not a file name.
        if (inFileName.Trim().Length == 0)
        {
            continue;
        }

        var wikiIdx = inFileName.IndexOf("wiki", StringComparison.Ordinal);
        if (wikiIdx < 0)
        {
            // Without the marker there is no language code; Substring(0, -1) would throw.
            Console.Error.WriteLine("Skipping {0}: file name does not contain \"wiki\".", inFileName);
            continue;
        }

        var langCode = inFileName.Substring(0, wikiIdx);
        var outFile = langCode + "_parsed.txt";
        long absCnt = 0;

        if (isContent)
        {
            using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
            using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
            {
                // Read() returning null marks the end of the dump.
                string text;
                while ((text = rd.Read()) != null)
                {
                    wr.WriteLine(Cleaning.CleanWiki(text));
                    absCnt++;
                }
            }
        }
        else
        {
            using (var rd = new XmlTextReader(inFileName))
            using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
            {
                // NOTE(review): ReadElementString leaves the reader on the node that
                // follows the end tag, and the next Read() steps past that node. Two
                // directly adjacent abstract elements would lose the second one -
                // confirm the dumps always have something in between.
                while (rd.Read())
                {
                    if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                    {
                        var text = rd.ReadElementString("abstract");
                        wr.WriteLine(Cleaning.CleanWiki(text));
                        absCnt++;
                    }
                }
            }
        }

        Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
    }
}