/// <summary>
/// Filters the English Wikipedia dump down to the articles whose raw text
/// contains a motorcycle infobox template and writes that subset to
/// "motorcycles.dat".
/// </summary>
private static void GetMotorcycleArticles()
{
    const string dumpPath = "enwiki-20160701-pages-articles-multistream.xml";
    const string infoboxMarker = "{{Infobox Motorcycle";
    const string destination = "motorcycles.dat";

    var dump = new Wikimedia(dumpPath);

    // An article qualifies when its text contains the opening of the infobox template.
    var motorcyclePages = dump.Articles
        .Where(page => page.Text.Contains(infoboxMarker));

    Wikimedia.WriteToDisk(motorcyclePages, destination);
}
/// <summary>
/// Entry point of the n-gram frequency tool.
/// </summary>
/// <remarks>
/// Two modes are supported, selected by the parsed command-line options:
/// <list type="bullet">
/// <item>No input file but a database file: the existing database is read and
/// its frequencies are dumped to the output file.</item>
/// <item>Otherwise: the input dump is read, disambiguation, redirect and special
/// pages are skipped, n-gram frequencies are calculated into the database file
/// (a temporary file when none was given) and, if an output file was given,
/// dumped to it.</item>
/// </list>
/// Prints usage information and returns when <c>ParseArgs</c> rejects the arguments.
/// </remarks>
/// <param name="args">Raw command-line arguments, forwarded to <c>ParseArgs</c>.</param>
private static void Main(string[] args)
{
    if (!ParseArgs(args))
    {
        Console.WriteLine("NGrams.exe [-in input.(xml|dat)] [-db frequencies.db] [-out frequencies.txt]");
        Console.WriteLine("At least two of the input files must be present:");
        Console.WriteLine(" -in specifies the Wikipedia plaintext dump location");
        Console.WriteLine(" -db specifies the location of processed n-gram frequencies.");
        Console.WriteLine(" When used with -in, the input file will be processed into -db");
        Console.WriteLine(" When used without -in, assume frequencies exist and read from this db");
        Console.WriteLine(" -out specifies the plaintext TSV that should contain the database dump");
        return;
    }

    if (string.IsNullOrEmpty(inputFile) && !string.IsNullOrEmpty(dbFile))
    {
        // No dump to process: turn the existing DB into frequency counts.
        DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
        return;
    }

    var wm = new Wikimedia(inputFile);
    var articles = wm.Articles
        .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

    if (articleLimit > 0)
    {
        articles = articles.Take(articleLimit);
    }

    // Without -db the database is only scratch storage, so back it with a
    // temp file and remember to remove that file once we are done.
    var usingTempDb = string.IsNullOrEmpty(dbFile);
    if (usingTempDb)
    {
        dbFile = Path.GetTempFileName();
    }

    try
    {
        Console.WriteLine("Beginning n-gram calculation");

        // Stopwatch is monotonic; DateTime.Now differences are skewed by
        // system clock or daylight-saving adjustments.
        var timer = System.Diagnostics.Stopwatch.StartNew();
        WordFrequency.CalculateNGramFrequencies(articles, dbFile, nGramSize);
        timer.Stop();
        Console.WriteLine("Calculation took " + timer.Elapsed);

        if (!string.IsNullOrEmpty(outputFile))
        {
            Console.WriteLine("Writing frequencies to disk");
            timer.Restart();
            DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
            timer.Stop();
            Console.WriteLine("Dump to disk took " + timer.Elapsed);
        }
    }
    finally
    {
        if (usingTempDb)
        {
            // Best-effort cleanup: Path.GetTempFileName creates a real file on
            // disk. Failing to delete it must not mask the outcome of the run.
            try
            {
                File.Delete(dbFile);
            }
            catch (IOException)
            {
                // File still in use; leave it for the OS temp cleanup.
            }
            catch (UnauthorizedAccessException)
            {
                // No permission to delete; nothing more we can do here.
            }
        }
    }
}
/// <summary>
/// Entry point of the article extraction tool.
/// </summary>
/// <remarks>
/// Reads the input dump and skips disambiguation, redirect and special pages.
/// When a title was requested, the first article whose title matches it
/// (current culture, case-insensitive) is written to the output file as raw
/// text or plaintext depending on the <c>rawText</c> option; if no article
/// matches, a message is printed and the method returns without reporting the
/// elapsed time. When no title was requested, the articles (optionally capped
/// by the article limit) are written to the output file and the count is
/// printed. Calls <c>Usage</c> and returns when <c>ParseArgs</c> rejects the
/// arguments.
/// </remarks>
/// <param name="args">Raw command-line arguments, forwarded to <c>ParseArgs</c>.</param>
static void Main(string[] args)
{
    if (!ParseArgs(args))
    {
        Usage();
        return;
    }

    // Stopwatch is monotonic; DateTime.Now differences are skewed by
    // system clock or daylight-saving adjustments.
    var timer = System.Diagnostics.Stopwatch.StartNew();

    var wm = new Wikimedia(inputFile);
    IEnumerable<WikimediaPage> articles = wm.Articles
        .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

    if (!string.IsNullOrEmpty(title))
    {
        // Single-article mode: extract just the requested page.
        var targetArticle = articles.FirstOrDefault(
            article => article.Title.Equals(title, StringComparison.CurrentCultureIgnoreCase));

        if (targetArticle == null)
        {
            Console.WriteLine("Could not find article '{0}'", title);
            return;
        }

        File.WriteAllText(outputFile, rawText ? targetArticle.Text : targetArticle.Plaintext);
    }
    else
    {
        // Bulk mode: write every remaining article, optionally capped.
        if (articleLimit > 0)
        {
            articles = articles.Take(articleLimit);
        }

        var numberOfArticles = Wikimedia.WriteToDisk(articles, outputFile);
        Console.WriteLine("Wrote {0} articles to disk.", numberOfArticles);
    }

    timer.Stop();
    TimeSpan processTime = timer.Elapsed;
    Console.WriteLine("Process took " + processTime);
}