예제 #1
0
        /// <summary>
        /// Feeds the article-namespace language links of every dump listed in
        /// <c>DumpsManager.Wikipedias</c> into <c>Networks.Instance</c>, then writes a wikitable with
        /// at most 100 networks that contain no "en" page, ordered by their number of distinct
        /// languages (descending), to "networks without enwiki.txt".
        /// </summary>
        static void Main()
        {
            DownloadStream.Log = true;
            CachingStream.CachePath = @"C:\Wikipedia dumps\";

            var networks = Networks.Instance;

            foreach (var lang in DumpsManager.Wikipedias)
            {
                // The dump name is the language code with '-' replaced by '_', followed by "wiki".
                string dumpName = lang.Replace('-', '_') + "wiki";
                Console.Out.Log(string.Format("Processing {0}.", dumpName));

                DateTime date = DumpsManager.GetLastDumpDate(dumpName);

                // Namespace id 0 keeps only links that originate from articles.
                var articleLangLinks = from link in LangLinks.Instance.Get(dumpName, date)
                                       where link.From != null && link.From.NamespaceId == 0
                                       select link;

                networks.ProcessDump(lang, articleLangLinks);
            }

            var largestNonEnwikiNetworks = networks.Roots
                .Where(root => !root.Children.Any(p => p.Language == "en"))
                .Select(root => new
                {
                    root,
                    languageCount = root.Children.Select(p => p.Language).Distinct().Count()
                })
                .OrderByDescending(candidate => candidate.languageCount)
                .Select(candidate => new { candidate.root.Children, LanguageCount = candidate.languageCount })
                .Take(100);

            const string outputFile = "networks without enwiki.txt";

            using (var writer = new StreamWriter(outputFile))
            {
                // The table delimiters go through the single-string overload, so their braces
                // are written verbatim and never interpreted as format placeholders.
                writer.WriteLine("{| class=\"wikitable\"");
                writer.WriteLine("|-");
                writer.WriteLine("! {0} !! {1} !! {2}", "No.", "Articles", "Count");

                int rowNumber = 0;

                foreach (var network in largestNonEnwikiNetworks)
                {
                    rowNumber++;

                    // One interwiki-style link per page, sorted by language and then by title.
                    var pageLinks = network.Children
                        .OrderBy(page => page.Language)
                        .ThenBy(page => page.Title)
                        .Select(page => string.Format("[[:{0}:{1}]]", page.Language, page.Title));

                    var joinedLinks = string.Join(", ", pageLinks);

                    writer.WriteLine("|-");
                    writer.WriteLine("| {0} || {1} || {2}", rowNumber, joinedLinks, network.LanguageCount);
                }

                writer.WriteLine("|}");
            }
        }
        /// <summary>
        /// Prints every page link of the "enwiki" dump dated by
        /// <c>DumpsManager.GetLastDumpDate</c> that goes from an article to an article,
        /// one per line, formatted as "From-&gt;To".
        /// </summary>
        static void Main()
        {
            const string wikiName = "enwiki";

            // Directory the dumps are downloaded to.
            CachingStream.CachePath = @"C:\Wikipedia dumps";

            // Other pages are not needed, so there is no reason to load them into memory.
            Pages.Instance.Limiter = pages => from page in pages
                                              where page.Namespace == Namespaces.Article
                                              select page;

            var allLinks = PageLinks.Instance.Get(wikiName, DumpsManager.GetLastDumpDate(wikiName));

            // From != null: because of the page limiter above, only links from articles remain.
            // ToNamespace == Article: keeps only links that point to articles.
            var articleToArticleLinks = from candidate in allLinks
                                        where candidate.From != null
                                              && candidate.ToNamespace == Namespaces.Article
                                        select candidate;

            foreach (var articleLink in articleToArticleLinks)
            {
                Console.WriteLine("{0}->{1}", articleLink.From.Title, articleLink.ToTitle);
            }
        }
        /// <summary>
        /// Interactively asks for the cache path, wiki, root category and dump date (an empty
        /// answer keeps the default shown in brackets), builds the category graph of the chosen
        /// dump and walks it depth-first from the root category. Every cycle found is passed to
        /// <c>ReportCycle</c> and the child link that closes it is removed.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // One pattern and one culture for both formatting the default date and parsing the
            // answer, so the suggested default always round-trips through ParseExact.
            const string dateFormat = "yyyyMMdd";
            var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

            string cachePath = PromptWithDefault("Cache path", Settings.Default.CachePath);
            Settings.Default.CachePath = cachePath;
            CachingStream.CachePath = cachePath;

            string wiki = PromptWithDefault("Wiki", Settings.Default.Wiki);
            Settings.Default.Wiki = wiki;

            string rootCategory = PromptWithDefault("Root category", Settings.Default.RootCategory);
            Settings.Default.RootCategory = rootCategory;

            string defaultDate = DumpsManager.GetLastDumpDate(wiki).ToString(dateFormat, invariantCulture);
            string dateString = PromptWithDefault("Date", defaultDate);

            Settings.Default.Save();

            // Validate the date before touching any output file, so a mistyped date does not
            // delete the results of an earlier run.
            DateTime date = DateTime.ParseExact(dateString, dateFormat, invariantCulture);

            DownloadStream.Log = true;

            PlaintextFile = string.Format("{0}-{1}-cycles.txt", wiki, dateString);
            File.Delete(PlaintextFile);
            WikitextFile = string.Format("{0}-{1}-cycles.wiki", wiki, dateString);
            File.Delete(WikitextFile);

            Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.Category);

            // Build the graph: each category link makes its source page a child of its target.
            var categories = new Dictionary<string, Category>();

            foreach (var categoryLink in CategoryLinks.Instance.Get(wiki, date))
            {
                Page fromPage = categoryLink.From;
                if (fromPage == null)
                {
                    continue;
                }

                Category child = GetOrAddCategory(categories, fromPage.Title);
                Category parent = GetOrAddCategory(categories, categoryLink.ToTitle);
                parent.Children.Add(child);
            }

            Category root;
            if (rootCategory == null || !categories.TryGetValue(rootCategory, out root))
            {
                // Without this check the lookup would fail with an uninformative exception.
                Console.Error.WriteLine("Root category '{0}' was not found in {1}.", rootCategory, wiki);
                Environment.ExitCode = 1;
                return;
            }

            // Iterative depth-first search. Each stack frame pairs a category with the queue of
            // its children that still have to be visited.
            stack = new Stack<Tuple<Category, Queue<Category>>>();
            stack.Push(Tuple.Create(root, new Queue<Category>(root.Children)));

            while (stack.Count > 0)
            {
                var frame = stack.Peek();
                Category currentCategory = frame.Item1;
                Queue<Category> pending = frame.Item2;

                if (pending.Count == 0)
                {
                    // All children handled: mark the category so it is never expanded again.
                    currentCategory.Closed = true;
                    stack.Pop();
                    continue;
                }

                Category next = pending.Dequeue();

                if (stack.Any(t => t.Item1 == next))
                {
                    // The child is already on the current path, so this link closes a cycle.
                    // Removing it from Children is safe: the queue is a separate copy.
                    ReportCycle(next);
                    currentCategory.Children.Remove(next);
                }
                else if (!next.Closed)
                {
                    stack.Push(Tuple.Create(next, new Queue<Category>(next.Children)));
                }
            }
        }

        // Writes "label [default]: " to the console and returns the entered line, or the default
        // when the answer is empty, whitespace or end of input.
        static string PromptWithDefault(string label, string defaultValue)
        {
            Console.Write("{0} [{1}]: ", label, defaultValue);
            string answer = Console.ReadLine();

            return string.IsNullOrWhiteSpace(answer) ? defaultValue : answer;
        }

        // Returns the category registered under the title, creating and registering it first
        // when it is not in the dictionary yet.
        static Category GetOrAddCategory(Dictionary<string, Category> categories, string title)
        {
            Category category;
            if (!categories.TryGetValue(title, out category))
            {
                category = new Category(title);
                categories.Add(title, category);
            }

            return category;
        }