/// <summary>
/// Feeds the article-namespace language links of every wiki listed in
/// <c>DumpsManager.Wikipedias</c> into <c>Networks</c>, then writes a wikitable of
/// the (at most) 100 networks that span the most languages and contain no
/// English ("en") page to "networks without enwiki.txt".
/// </summary>
static void Main()
{
    DownloadStream.Log = true;
    CachingStream.CachePath = @"C:\Wikipedia dumps\";

    var networks = Networks.Instance;

    foreach (var lang in DumpsManager.Wikipedias)
    {
        // Dump names use underscores where language codes use dashes.
        string dumpName = lang.Replace('-', '_') + "wiki";
        Console.Out.Log(string.Format("Processing {0}.", dumpName));

        DateTime date = DumpsManager.GetLastDumpDate(dumpName);

        // only articles (namespace 0) whose source page is known
        var langLinks = LangLinks.Instance
            .Get(dumpName, date)
            .Where(ll => ll.From != null && ll.From.NamespaceId == 0);

        networks.ProcessDump(lang, langLinks);
    }

    // Networks with no English member, ranked by the number of distinct languages they cover.
    var largestNonEnwikiNetworks = networks.Roots
        .Where(root => !root.Children.Any(p => p.Language == "en"))
        .Select(root => new
        {
            root.Children,
            LanguageCount = root.Children.Select(p => p.Language).Distinct().Count()
        })
        .OrderByDescending(network => network.LanguageCount)
        .Take(100);

    string fileName = "networks without enwiki.txt";

    using (var writer = new StreamWriter(fileName))
    {
        writer.WriteLine("{| class=\"wikitable\"");
        writer.WriteLine("|-");
        writer.WriteLine("! {0} !! {1} !! {2}", "No.", "Articles", "Count");

        int rowNumber = 0;

        foreach (var network in largestNonEnwikiNetworks)
        {
            rowNumber++;

            // One interwiki link per page, sorted by language and then by title.
            var articleLinks = network.Children
                .OrderBy(page => page.Language)
                .ThenBy(page => page.Title)
                .Select(page => string.Format("[[:{0}:{1}]]", page.Language, page.Title));

            writer.WriteLine("|-");
            writer.WriteLine(
                "| {0} || {1} || {2}",
                rowNumber,
                string.Join(", ", articleLinks),
                network.LanguageCount);
        }

        writer.WriteLine("|}");
    }
}
/// <summary>
/// Prints every article-to-article page link of the latest "enwiki" dump to the
/// console, one "From->To" pair per line.
/// </summary>
static void Main()
{
    const string wiki = "enwiki";

    // Directory the dumps are downloaded to.
    CachingStream.CachePath = @"C:\Wikipedia dumps";

    // Only article pages are needed, so keep all other pages out of memory.
    Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.Article);

    var pageLinks = PageLinks.Instance.Get(wiki, DumpsManager.GetLastDumpDate(wiki));

    var articleToArticleLinks =
        from pl in pageLinks
        where pl.From != null                         // with the limiter above, only links from articles have a source page
              && pl.ToNamespace == Namespaces.Article // only links to articles
        select pl;

    foreach (var link in articleToArticleLinks)
    {
        Console.WriteLine("{0}->{1}", link.From.Title, link.ToTitle);
    }
}
/// <summary>
/// Interactive entry point of the category-cycle finder.
/// Asks for the cache path, wiki, root category and dump date (an empty answer
/// keeps the default shown in brackets), saves the first three to
/// <c>Settings.Default</c>, builds the category graph from the category links of
/// the chosen dump and walks it depth-first from the root category, calling
/// <c>ReportCycle</c> for every link that points back to a category on the
/// current path.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <exception cref="System.FormatException">
/// The entered date is not in <c>yyyyMMdd</c> format.
/// </exception>
static void Main(string[] args)
{
    // Single source of truth for the date format, so the default that is shown
    // is always parseable again; the invariant culture keeps the Gregorian calendar.
    const string dateFormat = "yyyyMMdd";
    var invariantCulture = System.Globalization.CultureInfo.InvariantCulture;

    string cachePath = PromptWithDefault("Cache path", Settings.Default.CachePath);
    Settings.Default.CachePath = cachePath;
    CachingStream.CachePath = cachePath;

    string wiki = PromptWithDefault("Wiki", Settings.Default.Wiki);
    Settings.Default.Wiki = wiki;

    string rootCategory = PromptWithDefault("Root category", Settings.Default.RootCategory);
    Settings.Default.RootCategory = rootCategory;

    string defaultDate = DumpsManager.GetLastDumpDate(wiki).ToString(dateFormat, invariantCulture);
    string dateString = PromptWithDefault("Date", defaultDate);

    Settings.Default.Save();

    DownloadStream.Log = true;

    // Validate the date before it is used in file names and before any file is deleted.
    DateTime date = DateTime.ParseExact(dateString, dateFormat, invariantCulture);

    PlaintextFile = string.Format("{0}-{1}-cycles.txt", wiki, dateString);
    File.Delete(PlaintextFile);
    WikitextFile = string.Format("{0}-{1}-cycles.wiki", wiki, dateString);
    File.Delete(WikitextFile);

    // Only category pages are needed as link sources.
    Pages.Instance.Limiter = pages => pages.Where(p => p.Namespace == Namespaces.Category);

    // Build the graph: each category link makes the source page a child of the target category.
    var categories = new Dictionary<string, Category>();
    foreach (var categoryLink in CategoryLinks.Instance.Get(wiki, date))
    {
        Page fromPage = categoryLink.From;
        if (fromPage == null)
        {
            continue;
        }

        Category child = GetOrAddCategory(categories, fromPage.Title);
        Category parent = GetOrAddCategory(categories, categoryLink.ToTitle);
        parent.Children.Add(child);
    }

    Category root;
    if (!categories.TryGetValue(rootCategory, out root))
    {
        Console.Error.WriteLine(
            "Root category '{0}' does not occur in the category links of {1} ({2}).",
            rootCategory, wiki, dateString);
        Environment.ExitCode = 1;
        return;
    }

    // Iterative depth-first search. Each stack entry pairs a category on the
    // current path with the queue of its children that are still to be visited.
    stack = new Stack<Tuple<Category, Queue<Category>>>();
    stack.Push(new Tuple<Category, Queue<Category>>(root, new Queue<Category>(root.Children)));

    while (stack.Count > 0)
    {
        var top = stack.Peek();
        Category currentCategory = top.Item1;
        Queue<Category> pendingChildren = top.Item2;

        if (pendingChildren.Count == 0)
        {
            // All children handled: mark as closed so it is never expanded again.
            currentCategory.Closed = true;
            stack.Pop();
            continue;
        }

        Category toAdd = pendingChildren.Dequeue();

        if (stack.Any(t => t.Item1 == toAdd))
        {
            // The child is already on the current path, so this link closes a cycle.
            ReportCycle(toAdd);
            currentCategory.Children.Remove(toAdd);
        }
        else if (!toAdd.Closed)
        {
            stack.Push(new Tuple<Category, Queue<Category>>(toAdd, new Queue<Category>(toAdd.Children)));
        }
    }
}

/// <summary>
/// Writes "<paramref name="label"/> [<paramref name="defaultValue"/>]: " to the console
/// and reads one line; returns the default when the answer is null, empty or whitespace.
/// </summary>
private static string PromptWithDefault(string label, string defaultValue)
{
    Console.Write("{0} [{1}]: ", label, defaultValue);
    string answer = Console.ReadLine();
    return string.IsNullOrWhiteSpace(answer) ? defaultValue : answer;
}

/// <summary>
/// Returns the category registered under <paramref name="title"/>, creating and
/// registering a new one first when the title has not been seen yet.
/// </summary>
private static Category GetOrAddCategory(Dictionary<string, Category> categories, string title)
{
    Category category;
    if (!categories.TryGetValue(title, out category))
    {
        category = new Category(title);
        categories.Add(title, category);
    }

    return category;
}