コード例 #1
0
        /// <summary>
        /// Parses every Wikipedia dump found (recursively) under the "Data" folder of the
        /// current working directory and writes the cleaned text, one document per line,
        /// to "&lt;lang&gt;_&lt;input name&gt;_parsed.txt" next to each input file.
        /// </summary>
        /// <remarks>
        /// The language code is the part of the file name that precedes "wiki".
        /// Files whose path contains "abst" are treated as abstract dumps (the text of every
        /// non-empty &lt;abstract&gt; element is extracted); all other files are read through
        /// XmlContentReader. Files without "wiki" in their name, and files that look like
        /// the output of a previous run ("*_parsed.*"), are skipped.
        /// A summary line per processed file is written to standard error.
        /// </remarks>
        public static void ProcessInline()
        {
            const string parsedSuffix = "_parsed";
            var dataDir = Path.Combine(Directory.GetCurrentDirectory(), "Data");

            // GetFiles takes a snapshot of the directory listing before the loop starts.
            // A lazy enumeration may otherwise pick up the output files this loop creates
            // inside the very same directory tree.
            foreach (var inFileName in Directory.GetFiles(dataDir, "*", SearchOption.AllDirectories))
            {
                var baseName = Path.GetFileNameWithoutExtension(inFileName);

                // Output of an earlier run: never feed it back in as input.
                if (baseName.EndsWith(parsedSuffix, StringComparison.Ordinal))
                {
                    continue;
                }

                // IndexOf returns -1 when "wiki" is missing; Substring(0, -1) would throw.
                var wikiPos = baseName.IndexOf("wiki", StringComparison.Ordinal);
                if (wikiPos < 0)
                {
                    Console.Error.WriteLine("Skipping {0}: file name does not contain \"wiki\".", inFileName);
                    continue;
                }

                var langCode = baseName.Substring(0, wikiPos);
                var outFile  = Path.Combine(Path.GetDirectoryName(inFileName), langCode + "_" + baseName + parsedSuffix + ".txt");

                // Note: the "abst" marker is looked up in the full path, not just the file name.
                var isAbstract = inFileName.Contains("abst");
                long absCnt = 0;

                if (!isAbstract)
                {
                    using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
                    using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
                    {
                        while (true)
                        {
                            var text = rd.Read();
                            if (text == null)
                            {
                                break; // reader signals end of input with null
                            }
                            wr.WriteLine(Cleaning.CleanWiki(text));
                            absCnt++;
                        }
                    }
                }
                else
                {
                    using (var rd = new XmlTextReader(inFileName))
                    using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
                    {
                        while (!rd.EOF)
                        {
                            if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                            {
                                // ReadElementString leaves the reader on the node that follows
                                // </abstract>, so we must NOT call Read() here: doing so would
                                // skip that node (e.g. an immediately following <abstract>).
                                var text = rd.ReadElementString("abstract");
                                wr.WriteLine(Cleaning.CleanWiki(text));
                                absCnt++;
                            }
                            else
                            {
                                rd.Read();
                            }
                        }
                    }
                }

                Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
            }
        }
コード例 #2
0
 /// <summary>
 /// Parses the Wikipedia dumps listed in a text file and writes the cleaned text,
 /// one document per line, to "&lt;lang&gt;_parsed.txt" in the directory of each input.
 /// </summary>
 /// <param name="args">
 /// Exactly two entries: the path of a text file that lists one dump file per line,
 /// and the dump format, either "abstract" or "content". Anything else prints a usage
 /// message to standard error and returns without processing.
 /// </param>
 /// <remarks>
 /// The language code is the part of the dump's file name that precedes "wiki".
 /// Blank lines in the list are ignored and entries whose file name does not contain
 /// "wiki" are skipped with a message. Because the output name depends only on the
 /// directory and the language code, two inputs sharing both overwrite each other.
 /// </remarks>
 public static void ProcessCmdline(string[] args)
 {
     if (args == null || args.Length != 2 || (args[1] != "abstract" && args[1] != "content"))
     {
         Console.Error.WriteLine("Usage: ParseWikipedia xml-files.txt [abstract|content]");
         return;
     }

     var isContent = args[1] == "content";

     foreach (var line in File.ReadAllLines(args[0]))
     {
         var inFileName = line.Trim();
         if (inFileName.Length == 0)
         {
             continue; // tolerate blank lines in the list file
         }

         // Look for "wiki" in the file name only: a directory such as ".../wikipedia/"
         // would otherwise truncate the language code and make all outputs collide.
         var fileName = Path.GetFileName(inFileName);
         var wikiPos  = fileName.IndexOf("wiki", StringComparison.Ordinal);
         if (wikiPos < 0)
         {
             // IndexOf returned -1; Substring(0, -1) would throw.
             Console.Error.WriteLine("Skipping {0}: file name does not contain \"wiki\".", inFileName);
             continue;
         }

         var  langCode = fileName.Substring(0, wikiPos);
         var  outFile  = Path.Combine(Path.GetDirectoryName(inFileName) ?? string.Empty, langCode + "_parsed.txt");
         long absCnt   = 0;

         if (isContent)
         {
             using (var rd = new XmlContentReader(new StreamReader(inFileName, System.Text.Encoding.UTF8, false)))
             using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
             {
                 while (true)
                 {
                     var text = rd.Read();
                     if (text == null)
                     {
                         break; // reader signals end of input with null
                     }
                     wr.WriteLine(Cleaning.CleanWiki(text));
                     absCnt++;
                 }
             }
         }
         else
         {
             using (var rd = new XmlTextReader(inFileName))
             using (var wr = new StreamWriter(outFile, false, System.Text.Encoding.UTF8))
             {
                 while (!rd.EOF)
                 {
                     if (rd.IsStartElement("abstract") && !rd.IsEmptyElement)
                     {
                         // ReadElementString leaves the reader on the node that follows
                         // </abstract>, so we must NOT call Read() here: doing so would
                         // skip that node (e.g. an immediately following <abstract>).
                         var text = rd.ReadElementString("abstract");
                         wr.WriteLine(Cleaning.CleanWiki(text));
                         absCnt++;
                     }
                     else
                     {
                         rd.Read();
                     }
                 }
             }
         }

         Console.Error.WriteLine("Done with {0}. Wrote {1} docs.", langCode, absCnt);
     }
 }