Ejemplo n.º 1
0
        /// <summary>
        /// Reads the dump file "enwiki-20160701-pages-articles-multistream.xml", keeps the
        /// articles whose text contains the "{{Infobox Motorcycle" marker, and writes the
        /// selection to "motorcycles.dat".
        /// </summary>
        private static void GetMotorcycleArticles()
        {
            // Opening marker of the infobox template that identifies the articles to keep.
            const string infoboxMarker = "{{Infobox Motorcycle";

            var dump = new Wikimedia("enwiki-20160701-pages-articles-multistream.xml");
            var motorcyclePages = dump.Articles.Where(page => page.Text.Contains(infoboxMarker));

            Wikimedia.WriteToDisk(motorcyclePages, "motorcycles.dat");
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Entry point of the n-gram tool. Depending on the parsed arguments it either
        /// dumps the frequencies of an existing database to the output file, or computes
        /// n-gram frequencies from the input articles into a database and then optionally
        /// dumps them to the output file.
        /// </summary>
        /// <param name="args">Raw command-line arguments; they are handed to <c>ParseArgs</c>.</param>
        private static void Main(string[] args)
        {
            if (!ParseArgs(args))
            {
                Console.WriteLine("NGrams.exe [-in input.(xml|dat)] [-db frequencies.db] [-out frequencies.txt]");
                Console.WriteLine("At least two of the input files must be present:");
                Console.WriteLine("   -in specifies the Wikipedia plaintext dump location");
                Console.WriteLine("   -db specifies the location of processed n-gram frequencies.");
                Console.WriteLine("        When used with -in, the input file will be processed into -db");
                Console.WriteLine("        When used without -in, assume frequencies exist and read from this db");
                Console.WriteLine("   -out specifies the plaintext TSV that should contain the database dump");
                return;
            }

            if (string.IsNullOrEmpty(inputFile) && !string.IsNullOrEmpty(dbFile))
            {
                // No input file: process the existing DB into frequency counts.
                DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
                return;
            }

            var wm = new Wikimedia(inputFile);
            var articles = wm.Articles
                    .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

            if (articleLimit > 0)
            {
                articles = articles.Take(articleLimit);
            }

            // When no database path was given the db is only scratch space: use a temp
            // file and remember that we own it so it can be removed afterwards.
            var usingTemporaryDb = string.IsNullOrEmpty(dbFile);
            if (usingTemporaryDb)
            {
                dbFile = Path.GetTempFileName();
            }

            try
            {
                Console.WriteLine("Beginning n-gram calculation");

                // Stopwatch is monotonic, unlike DateTime.Now which jumps on clock/DST changes.
                var timer = System.Diagnostics.Stopwatch.StartNew();
                WordFrequency.CalculateNGramFrequencies(articles, dbFile, nGramSize);
                timer.Stop();

                Console.WriteLine("Calculation took " + timer.Elapsed);

                if (!string.IsNullOrEmpty(outputFile))
                {
                    Console.WriteLine("Writing frequencies to disk");

                    timer.Restart();
                    DumpFrequenciesToDisk(WordFrequency.GetNGramFrequencies(dbFile, cutoff), outputFile);
                    timer.Stop();

                    Console.WriteLine("Dump to disk took " + timer.Elapsed);
                }
            }
            finally
            {
                if (usingTemporaryDb)
                {
                    // Best-effort cleanup: failing to remove the scratch file must not
                    // mask the real outcome of the run, so only report it.
                    try
                    {
                        File.Delete(dbFile);
                    }
                    catch (IOException ex)
                    {
                        Console.Error.WriteLine("Could not delete temporary file '" + dbFile + "': " + ex.Message);
                    }
                    catch (UnauthorizedAccessException ex)
                    {
                        Console.Error.WriteLine("Could not delete temporary file '" + dbFile + "': " + ex.Message);
                    }
                }
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Entry point of the article extraction tool. Filters out disambiguation,
        /// redirect and special pages, then either writes the text of the single article
        /// whose title matches <c>title</c> to the output file, or writes the (optionally
        /// limited) set of articles to the output file. Prints the elapsed time at the end.
        /// </summary>
        /// <param name="args">Raw command-line arguments; they are handed to <c>ParseArgs</c>.</param>
        static void Main(string[] args)
        {
            if (!ParseArgs(args))
            {
                Usage();
                return;
            }

            // Stopwatch is monotonic, unlike DateTime.Now which jumps on clock/DST changes.
            var timer = System.Diagnostics.Stopwatch.StartNew();

            var wm = new Wikimedia(inputFile);

            IEnumerable<WikimediaPage> articles = wm.Articles
                    .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

            if (!string.IsNullOrEmpty(title))
            {
                // Single-article mode: look the article up by case-insensitive title.
                var targetArticle =
                    articles.FirstOrDefault(
                        article => article.Title.Equals(title, StringComparison.CurrentCultureIgnoreCase));

                if (targetArticle == null)
                {
                    Console.WriteLine("Could not find article '{0}'", title);
                    return;
                }

                // rawText selects the article's Text over its Plaintext.
                File.WriteAllText(outputFile, rawText ? targetArticle.Text : targetArticle.Plaintext);
            }
            else
            {
                if (articleLimit > 0)
                {
                    articles = articles.Take(articleLimit);
                }

                var numberOfArticles = Wikimedia.WriteToDisk(articles, outputFile);
                Console.WriteLine("Wrote {0} articles to disk.", numberOfArticles);
            }

            timer.Stop();

            TimeSpan processTime = timer.Elapsed;
            Console.WriteLine("Process took " + processTime);
        }