/// <summary>
/// Loads all page titles from the Mongo-backed storage, builds the node-key
/// information from them and writes it to <c>NodeKeysInfoOutputFileName</c>,
/// then builds a map from every entry of <c>nodeKeysInfo.Regular</c> to the
/// distinct, non-null mapped references extracted from that page's content and
/// writes the map to <c>EdgesOutputFileName</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used by this method.</param>
/// <returns>A task that completes once both output files have been written.</returns>
private static async Task Main(string[] args)
{
    var mongoClient = new MongoClient(MongoDbConnString);
    var storage = new MongoWikiDownloadStorage(mongoClient);

    var titles = await storage.GetAllPageTitlesAsync();
    var nodeKeysInfo = NodeKeysInfoCalculator.Build(titles);

    // Written asynchronously so the async entry point never blocks on file I/O.
    await File.WriteAllTextAsync(NodeKeysInfoOutputFileName, nodeKeysInfo.ToJson());

    var edges = new Dictionary<string, string[]>();
    foreach (var title in nodeKeysInfo.Regular)
    {
        var content = await storage.GetPageContent(title);

        // Map each raw reference through MapReference, drop the ones it maps
        // to null, and de-duplicate the remainder.
        edges[title] = ExtractReferences(content)
            .Select(x => MapReference(nodeKeysInfo, x))
            .Where(x => x != null)
            .Distinct()
            .ToArray();
    }

    await File.WriteAllTextAsync(EdgesOutputFileName, edges.ToJson());
}
/// <summary>
/// Reads the article links listed in <c>ArticleListFileName</c>, looks each one
/// up among the stored page titles (after replacing underscores with spaces),
/// loads the page content and stores its HTML inner text keyed by the original
/// link. Links that have no matching title or no stored content are recorded
/// as errors. The error/ref summary is written to
/// <c>ExtractInfoOutputFileName</c> and the extracted pages to
/// <c>ExtractedPagesOutputFileName</c>, both serialized as JSON.
/// </summary>
/// <returns>A task that completes once both output files have been written.</returns>
private static async Task Main()
{
    var mongoClient = new MongoClient(MongoDbConnString);
    var storage = new MongoWikiDownloadStorage(mongoClient);

    var links = await File.ReadAllLinesAsync(ArticleListFileName);
    var titles = await storage.GetAllPageTitlesAsync();
    var titlesDict = titles.ToDictionary(x => x.Name);

    // NOTE: refs is never filled in this method; it is kept so the serialized
    // extract info still contains its "refs" property.
    var refs = new List<string>();
    var errors = new List<string>();
    var pages = new Dictionary<string, string>();

    foreach (var link in links)
    {
        var name = link.Replace("_", " ");

        // Single lookup instead of ContainsKey followed by the indexer.
        if (!titlesDict.TryGetValue(name, out var title))
        {
            errors.Add(link);
            continue;
        }

        // Content is fetched under ReferenceName when it is set, otherwise
        // under the title's own name.
        var content = await storage.GetPageContent(title.ReferenceName ?? title.Name);
        if (content == null)
        {
            errors.Add(link);
            continue;
        }

        // Keep only the text of the HTML document, keyed by the original link.
        var document = new HtmlDocument();
        document.LoadHtml(content);
        pages[link] = document.DocumentNode.InnerText;
    }

    var extractInfo = new { errors, refs };
    await File.WriteAllTextAsync(ExtractInfoOutputFileName, JsonConvert.SerializeObject(extractInfo));
    await File.WriteAllTextAsync(ExtractedPagesOutputFileName, JsonConvert.SerializeObject(pages));
}