/// <summary>
/// Lists the cat entities that still need processing.
/// </summary>
/// <param name="processed">Entity ids that should be skipped.</param>
/// <returns>
/// A lazily evaluated sequence of (entity id, zh wiki page title, page object) tuples.
/// <paramref name="processed"/> is consulted while the sequence is being enumerated,
/// not when this method returns.
/// </returns>
private IEnumerable<(string Id, string Title, WikiPage ZhPage)> GetCatsToProcess(ICollection<string> processed)
{
    // The SPARQL query itself runs right away; only the projection/filter below is deferred.
    var rows = GetCats();
    var pending =
        from row in rows
        let id = CPRepository.StripEntityUri(((IUriNode)row.Value("cat")).Uri)
        let title = row.Value("title").AsValuedNode().AsString()
        where !processed.Contains(id)
        select (id, title, page: new WikiPage(zhWarriorsSite, title));
    return pending;
}
/// <summary>
/// Runs the SPARQL query that binds <c>?cat</c> and <c>?title</c> for every entity with
/// <c>wdt:P3 wd:Q622</c> that has a link belonging to https://warriors.huijiwiki.com/.
/// </summary>
/// <returns>The raw result set returned by <c>CPRepository.ExecuteQuery</c>.</returns>
private SparqlResultSet GetCats()
{
    const string query = @" SELECT ?cat ?title { ?cat wdt:P3 wd:Q622. ?link schema:isPartOf <https://warriors.huijiwiki.com/>; schema:about ?cat; schema:name ?title. }";
    var results = CPRepository.ExecuteQuery(query);
    return results;
}
/// <summary>
/// Exports a hand-picked list of zh wiki pages, skipping any page whose title is already
/// recorded as a <c>schema:name</c> under https://warriors.huijiwiki.com/ in the repository.
/// </summary>
/// <remarks>
/// When the export raises a <c>WikiClientException</c>, the exception is printed, the method
/// blocks on a key press, refreshes the page and tries the same page again.
/// </remarks>
public async Task RunAsync()
{
    await zhWarriorsSite.Initialization;

    var categoryMembers = new CategoryMembersGenerator(zhWarriorsSite, "没有图片的猫物")
    {
        PaginationSize = 50,
        MemberTypes = CategoryMemberTypes.Page,
    };
    var pages = categoryMembers.EnumPagesAsync(PageQueryOptions.FetchContent);

    // The category listing above is replaced straight away by the fixed title list below.
    // NOTE(review): the literal is split on CR/LF only, so titles are expected one per line
    // inside it — confirm the literal's line breaks are intact in the real file.
    var titles = @" 焦风 微光毛_(黑莓星的风暴) ".Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    var fixedPages = titles.Select(t => new WikiPage(zhWarriorsSite, t)).ToList();
    await fixedPages.RefreshAsync(PageQueryOptions.FetchContent | PageQueryOptions.ResolveRedirects);
    pages = fixedPages.ToAsyncEnumerable();

    var counter = 0;
    await foreach (var page in pages)
    {
        counter++;

        var query = CPRepository.CreateQuery(@" SELECT ?link { ?link schema:isPartOf <https://warriors.huijiwiki.com/>; schema:name @title. }");
        query.SetLiteral("title", page.Title, "zh");
        var alreadyLinked = CPRepository.ExecuteQuery(query).Any();
        if (alreadyLinked)
        {
            Logger.LogWarning("Exists {}", page);
        }
        else
        {
            Logger.LogInformation("[{}] Processing {}", counter, page);

            // Keep trying the same page until the export goes through.
            var exported = false;
            while (!exported)
            {
                try
                {
                    await ExportEntityAsync(page);
                    exported = true;
                }
                catch (WikiClientException ex)
                {
                    Console.WriteLine(ex);
                    Console.ReadKey();
                    await page.RefreshAsync(PageQueryOptions.FetchContent);
                }
            }
        }
    }
}
/// <summary>
/// Walks the members of the "猫物" category on the zh wiki and exports every page whose
/// title is not yet recorded as a <c>schema:name</c> under https://warriors.huijiwiki.com/
/// in the repository.
/// </summary>
/// <remarks>
/// When the export raises a <c>WikiClientException</c>, the exception is printed, the method
/// blocks on a key press, refreshes the page and tries the same page again.
/// </remarks>
public async Task RunAsync()
{
    await zhWarriorsSite.Initialization;

    var categoryMembers = new CategoryMembersGenerator(zhWarriorsSite, "猫物")
    {
        PaginationSize = 50,
        MemberTypes = CategoryMemberTypes.Page,
    };

    // True when the repository already holds a link named after the page title.
    bool IsAlreadyLinked(WikiPage candidate)
    {
        var query = CPRepository.CreateQuery(@" SELECT ?link { ?link schema:isPartOf <https://warriors.huijiwiki.com/>; schema:name @title. }");
        query.SetLiteral("title", candidate.Title, "zh");
        return CPRepository.ExecuteQuery(query).Any();
    }

    var counter = 0;
    using (var cursor = categoryMembers.EnumPagesAsync(PageQueryOptions.FetchContent).GetEnumerator())
    {
        while (await cursor.MoveNext())
        {
            counter++;
            var page = cursor.Current;

            if (IsAlreadyLinked(page))
            {
                Logger.LogWarning("Exists {}", page);
            }
            else
            {
                Logger.LogInformation("[{}] Processing {}", counter, page);

                // Keep trying the same page until the export goes through.
                for (var exported = false; !exported;)
                {
                    try
                    {
                        await ExportEntityAsync(page);
                        exported = true;
                    }
                    catch (WikiClientException ex)
                    {
                        Console.WriteLine(ex);
                        Console.ReadKey();
                        await page.RefreshAsync(PageQueryOptions.FetchContent);
                    }
                }
            }
        }
    }
}
/// <summary>
/// For every entity bound to <c>?book</c> by the SPARQL query below (with its English label),
/// expands <c>{{Chapters/b|label}}</c> on the English wiki, treats each anchor in the
/// rendered HTML as one chapter, and creates a new Wikibase item per chapter carrying
/// en / zh-cn / zh-tw labels, descriptions and aliases, a P3 claim, a P50 claim pointing at
/// the book, and (when the anchor has no "new" class) an "enwarriorswiki" site link.
/// </summary>
/// <remarks>
/// Chapters are linked in order: the P50 claim of each chapter gets a P48 qualifier holding
/// the id of the chapter handled just before it in the same book.
/// </remarks>
public async Task RunAsync() {
    var enWw = new WikiSite(Site.WikiClient, "http://warriors.wikia.com/api.php");
    // Earlier variant of the book query, kept as commented-out code:
    // var books = CPRepository.ExecuteQuery(@"
    //SELECT ?book ?link {
    // { ?book wdt:P3 wd:Q46. } UNION { ?book wdt:P3 wd:Q116. }
    // ?link schema:isPartOf <http://warriors.wikia.com/>;
    // schema:about ?book.
    //}")
    // .Select(r => (book: (UriNode)r["book"], link: (UriNode)r["link"]))
    // .ToList();
    // Lazily projected; the result set is enumerated once by the foreach below.
    var books = CPRepository.ExecuteQuery(@" SELECT ?book ?label { { ?book wdt:P3 wd:Q46. } UNION { ?book wdt:P3 wd:Q116. } ?book rdfs:label ?label. FILTER (lang(?label) = 'en') }")
        .Select(r => (id: CPRepository.StripEntityUri(((UriNode)r["book"]).Uri), label: ((LiteralNode)r["label"]).Value));
    await enWw.Initialization;
    foreach (var book in books) {
        // Id of the most recently handled chapter of this book; null for the first chapter.
        string lastChapterId = null;
        var bookItem = new Entity(Site, book.id);
        // Template argument; may get " (Book)" appended before a retry (see below).
        var tlabel = book.label;
        RETRY:
        // The parse request is started here and only awaited after the entity refresh,
        // presumably so that both requests can be in flight at the same time.
        var parsingTask = enWw.ParseContentAsync("{{Chapters/b|" + tlabel + "}}", null, null, ParsingOptions.None);
        await bookItem.RefreshAsync(EntityQueryOptions.FetchLabels | EntityQueryOptions.FetchAliases, new[] { "en", "zh-cn", "zh-tw" });
        var labelEn = bookItem.Labels["en"];
        // Fallback chain: zh-cn falls back to en, zh-tw falls back to zh-cn.
        var labelCn = bookItem.Labels["zh-cn"] ?? labelEn;
        var labelTw = bookItem.Labels["zh-tw"] ?? labelCn;
        Logger.LogInformation("{}, {}, {}", labelEn, labelCn, labelTw);
        var doc = new HtmlDocument();
        doc.LoadHtml((await parsingTask).Content);
        // Every anchor with an href in the rendered template output is treated as a chapter.
        var nodes = doc.DocumentNode.SelectNodes("//a[@href]");
        if (nodes == null) {
            // No anchors at all: retry once with a " (Book)" suffix, unless the label
            // already contains a parenthesis; otherwise give up on this book.
            if (!tlabel.Contains('(')) {
                tlabel += " (Book)";
                goto RETRY;
            }
            Logger.LogError("No chapter information found.");
            continue;
        }
        foreach (var node in nodes) {
            var chLabels = new WbMonolingualTextCollection();
            var chAliases = new WbMonolingualTextsCollection();
            var chDescriptions = new WbMonolingualTextCollection();
            var text = node.InnerText.Trim();
            // n is null when the anchor text is not recognised as a chapter number.
            var n = TryMatchChapterNumber(text);
            // nId becomes the P53 qualifier value: the number, "0" for a prologue,
            // "E" for an epilogue, or null for any other named chapter.
            var nId = n?.ToString();
            if (n == null) {
                switch (text.ToLowerInvariant()) {
                    case "prologue":
                        chLabels["en"] = labelEn + ", Prologue";
                        chDescriptions["en"] = "prologue chapter of " + labelEn;
                        foreach (var a in bookItem.Aliases["en"]) {
                            chAliases.Add("en", a + "-0");
                        }
                        chLabels["zh-cn"] = "《" + labelCn + "》引子";
                        chDescriptions["zh-cn"] = "《" + labelCn + "》的引子章节";
                        chLabels["zh-tw"] = "《" + labelTw + "》序章";
                        chDescriptions["zh-tw"] = "《" + labelTw + "》的序章";
                        chAliases.Add("zh-cn", labelCn + " 引子");
                        chAliases.Add("zh-tw", labelTw + " 序章");
                        chAliases.Add("zh-cn", labelCn + " 0");
                        chAliases.Add("zh-tw", labelTw + " 0");
                        nId = "0";
                        break;
                    case "epilogue":
                        chLabels["en"] = labelEn + ", Epilogue";
                        chDescriptions["en"] = "epilogue chapter of " + labelEn;
                        foreach (var a in bookItem.Aliases["en"]) {
                            chAliases.Add("en", a + "-E");
                        }
                        chLabels["zh-cn"] = "《" + labelCn + "》尾声";
                        chDescriptions["zh-cn"] = "《" + labelCn + "》的尾声章节";
                        chLabels["zh-tw"] = "《" + labelTw + "》尾聲";
                        chDescriptions["zh-tw"] = "《" + labelTw + "》的尾聲章節";
                        chAliases.Add("zh-cn", labelCn + " 尾声");
                        chAliases.Add("zh-tw", labelTw + " 尾聲");
                        nId = "E";
                        break;
                    default:
                        // Any other anchor text is used verbatim as the chapter name;
                        // aliases use the abbreviation returned by GetAbbr.
                        chLabels["en"] = labelEn + ", " + text;
                        chDescriptions["en"] = "a chapter of " + labelEn;
                        var abbr = GetAbbr(text);
                        foreach (var a in bookItem.Aliases["en"]) {
                            chAliases.Add("en", a + "-" + abbr);
                        }
                        chLabels["zh-cn"] = "《" + labelCn + "》" + text;
                        chDescriptions["zh-cn"] = "《" + labelCn + "》的一个章节";
                        chLabels["zh-tw"] = "《" + labelTw + "》" + text;
                        chDescriptions["zh-tw"] = "《" + labelTw + "》的一個章節";
                        chAliases.Add("zh-cn", labelCn + " " + abbr);
                        chAliases.Add("zh-tw", labelTw + " " + abbr);
                        break;
                }
            } else {
                // Numbered chapter.
                chLabels["en"] = labelEn + ", Chapter " + n;
                chDescriptions["en"] = "Chapter " + n + " of " + labelEn;
                foreach (var a in bookItem.Aliases["en"]) {
                    chAliases.Add("en", a + "-" + n);
                }
                var zhOrdinal = Utility.GetOrdinalZh(n.Value);
                chLabels["zh-cn"] = "《" + labelCn + "》第" + zhOrdinal + "章";
                chDescriptions["zh-cn"] = "《" + labelCn + "》的第" + zhOrdinal + "章";
                chLabels["zh-tw"] = "《" + labelTw + "》第" + zhOrdinal + "章";
                chDescriptions["zh-tw"] = "《" + labelTw + "》的第" + zhOrdinal + "章";
                chAliases.Add("zh-cn", labelCn + " " + n);
                chAliases.Add("zh-tw", labelTw + " " + n);
            }
            string cid = null;
            if ((cid = CPRepository.EntityFromLabel(chLabels["en"])) != null) {
                // An entity with this English label already exists: remember it as the
                // previous chapter and stop handling the remaining anchors of this book.
                // Books whose label contains "Hollyleaf's Story" instead skip only this
                // anchor and carry on with the next one.
                Logger.LogWarning("Entity exists.");
                lastChapterId = cid;
                if (labelEn.Contains("Hollyleaf's Story")) {
                    continue;
                }
                break;
            }
            // Drop the Chinese variants when their book label equals the label it would
            // fall back to (presumably meaning no real translation exists — TODO confirm).
            if (labelEn == labelCn) {
                chLabels.Remove("zh-cn");
                chAliases.Remove("zh-cn");
                chDescriptions.Remove("zh-cn");
            }
            if (labelEn == labelTw || labelCn == labelTw) {
                chLabels.Remove("zh-tw");
                chAliases.Remove("zh-tw");
                chDescriptions.Remove("zh-tw");
            }
            //foreach (var l in chLabels) Console.WriteLine(l);
            //foreach (var l in chAliases) Console.WriteLine(l);
            //foreach (var l in chDescriptions) Console.WriteLine(l);
            var claims = new List <Claim> {
                new Claim("P3", "Q109", BuiltInDataTypes.WikibaseItem),
            };
            // Extra block keeps the local `c` out of scope of the lambda parameter `c` used below.
            {
                // P50 points at the book; P53 carries the chapter designator and P48 the
                // id of the previously handled chapter, each only when available.
                var c = new Claim("P50", book.id, BuiltInDataTypes.WikibaseItem);
                if (nId != null) {
                    c.Qualifiers.Add(new Snak("P53", nId, BuiltInDataTypes.String));
                }
                if (lastChapterId != null) {
                    c.Qualifiers.Add(new Snak("P48", lastChapterId, BuiltInDataTypes.WikibaseItem));
                }
                claims.Add(c);
            }
            // New item; everything is submitted in a single bulk edit.
            var chEntity = new Entity(Site, EntityType.Item);
            var edits = new List <EntityEditEntry>();
            edits.AddRange(chLabels.Select(l => new EntityEditEntry(nameof(chEntity.Labels), l)));
            edits.AddRange(chAliases.Select(l => new EntityEditEntry(nameof(chEntity.Aliases), l)));
            edits.AddRange(chDescriptions.Select(l => new EntityEditEntry(nameof(chEntity.Descriptions), l)));
            edits.AddRange(claims.Select(c => new EntityEditEntry(nameof(chEntity.Claims), c)));
            // NOTE(review): the "new" CSS class presumably marks a link to a page that does
            // not exist yet, in which case no site link is added — confirm.
            if (!node.HasClass("new")) {
                // Page title = URL-decoded href with every "/wiki/" occurrence removed.
                var title = WebUtility.UrlDecode(node.GetAttributeValue("href", "").Replace("/wiki/", ""));
                edits.Add(new EntityEditEntry(nameof(chEntity.SiteLinks), new EntitySiteLink("enwarriorswiki", title)));
            }
            await chEntity.EditAsync(edits, "Populate chapter.", EntityEditOptions.Bulk | EntityEditOptions.Bot);
            lastChapterId = chEntity.Id;
        }
    }
}
/// <summary>
/// Reads the <c>Infobox cat</c> template of every not-yet-processed cat page on the zh wiki
/// and adds relation claims to the matching entity: P88 (from <c>father</c>), P89 (from
/// <c>mother</c>), P99 (from <c>foster_father</c> / <c>foster_mother</c>), P100 (from
/// <c>mate</c>, with a 1-based P53 ordinal qualifier) and P86 (from <c>mentor</c>).
/// </summary>
/// <remarks>
/// Pages are fetched in batches of 50. An entity is recorded as processed only when its edit
/// completes without a <see cref="KeyNotFoundException"/>; the processed list is written
/// out after every page either way.
/// </remarks>
public async Task PopulateRelationsAsync()
{
    var processedEntities = GetProcessedEntities();
    await zhWarriorsSite.Initialization;
    var counter = 0;
    foreach (var catg in GetCatsToProcess(processedEntities).Buffer(50))
    {
        await catg.Select(t => t.ZhPage).RefreshAsync(PageQueryOptions.FetchContent);
        foreach (var (id, title, page) in catg)
        {
            counter++;
            Logger.LogInformation("[{}] Processing {} -> {}", counter, title, id);
            try
            {
                await EditEntityAsync(new Entity(Site, id), page);
                processedEntities.Add(id);
            }
            catch (KeyNotFoundException)
            {
                // A linked cat has no entity yet; leave this one unprocessed.
                Logger.LogWarning("Missing entity.");
            }
            WriteProcessedEntities(processedEntities);
        }
    }

    // Extracts the relations from the page's infobox and submits them as claims.
    // Throws KeyNotFoundException when a linked page cannot be resolved to an entity.
    async Task EditEntityAsync(Entity entity, WikiPage page)
    {
        var root = parser.Parse(page.Content);
        var infobox = root.EnumDescendants().TemplatesWithTitle("Infobox cat").FirstOrDefault();
        if (infobox == null)
        {
            Logger.LogError("No {{Infobox cat}} found.");
            return;
        }

        // All wikilinks found in the given infobox arguments, in argument order.
        // A missing argument contributes nothing. (The original chained `?.` through
        // Concat, so a missing foster_father discarded the foster_mother links as well.)
        IEnumerable<WikiLink> Links(params string[] argumentNames) => argumentNames
            .SelectMany(name => infobox.Arguments[name]?.Value.EnumDescendants() ?? Enumerable.Empty<Node>())
            .OfType<WikiLink>();

        // Maps a zh wiki link target to its entity id, or records it as missing and throws.
        string Resolve(string linkTarget)
        {
            var entityId = CPRepository.EntityFromZhSiteLink(linkTarget);
            if (entityId == null)
            {
                WriteMissingEntity(linkTarget);
                throw new KeyNotFoundException();
            }
            return entityId;
        }

        var father = Links("father").FirstOrDefault()?.Target.ToPlainText();
        var mother = Links("mother").FirstOrDefault()?.Target.ToPlainText();
        var mates = Links("mate").Select(l => l.Target.ToPlainText()).ToList();
        var fosters = Links("foster_father", "foster_mother").Select(l => l.Target.ToPlainText()).ToList();
        var mentors = Links("mentor").Select(l => l.Target.ToPlainText()).ToList();

        // Debug output. The lists are never null now, so string.Join can no longer throw
        // ArgumentNullException for an infobox that lacks one of the arguments.
        Console.WriteLine(father);
        Console.WriteLine(mother);
        Console.WriteLine(string.Join(";", mates));
        Console.WriteLine(string.Join(";", fosters));
        Console.WriteLine(string.Join(";", mentors));

        // Resolution order (and therefore which missing link is reported first) follows
        // the original: father, mother, fosters, mates, mentors.
        var claims = new List<Claim>();
        if (father != null)
        {
            claims.Add(new Claim("P88", Resolve(father), BuiltInDataTypes.WikibaseItem));
        }
        if (mother != null)
        {
            claims.Add(new Claim("P89", Resolve(mother), BuiltInDataTypes.WikibaseItem));
        }
        foreach (var foster in fosters)
        {
            claims.Add(new Claim("P99", Resolve(foster), BuiltInDataTypes.WikibaseItem));
        }
        var index = 1;
        foreach (var mate in mates)
        {
            claims.Add(new Claim("P100", Resolve(mate), BuiltInDataTypes.WikibaseItem)
            {
                Qualifiers = { new Snak("P53", index.ToString(), BuiltInDataTypes.String) }
            });
            index++;
        }
        foreach (var mentor in mentors)
        {
            claims.Add(new Claim("P86", Resolve(mentor), BuiltInDataTypes.WikibaseItem));
        }

        if (claims.Any())
        {
            await entity.EditAsync(claims.Select(c => new EntityEditEntry(nameof(entity.Claims), c)),
                "Populate relations from zhwarriorswiki.", EntityEditOptions.Bot);
        }
    }
}
/// <summary>
/// Reads the <c>past_affiliation</c> and <c>current_affiliation</c> arguments of the
/// <c>Infobox cat</c> template of every not-yet-processed cat page on the zh wiki and adds
/// one P76 claim per affiliation to the matching entity, with a P92 qualifier when a
/// position item could be found.
/// </summary>
/// <remarks>
/// Pages are fetched in batches of 50. An entity is recorded as processed only when its edit
/// completes without a <see cref="KeyNotFoundException"/>; the processed list is written
/// out after every page either way.
/// </remarks>
public async Task PopulateAffiliationsAsync()
{
    var processedEntities = GetProcessedEntities();
    await zhWarriorsSite.Initialization;
    var counter = 0;
    foreach (var catg in GetCatsToProcess(processedEntities).Buffer(50))
    {
        await catg.Select(t => t.ZhPage).RefreshAsync(PageQueryOptions.FetchContent);
        foreach (var (id, title, page) in catg)
        {
            counter++;
            Logger.LogInformation("[{}] Processing {} -> {}", counter, title, id);
            try
            {
                await EditEntityAsync(new Entity(Site, id), page);
                processedEntities.Add(id);
            }
            catch (KeyNotFoundException)
            {
                Logger.LogWarning("Missing entity.");
            }
            WriteProcessedEntities(processedEntities);
        }
    }

    // Builds the affiliation claims for one page and submits them in a single edit.
    async Task EditEntityAsync(Entity entity, WikiPage page)
    {
        var root = parser.Parse(page.Content);
        var infobox = root.EnumDescendants().TemplatesWithTitle("Infobox cat").FirstOrDefault();
        if (infobox == null)
        {
            Logger.LogError("No {{Infobox cat}} found.");
            return;
        }

        var pastAff = await ExtractAffiliationsEx(infobox.Arguments["past_affiliation"]?.Value);
        var curAff = await ExtractAffiliationsEx(infobox.Arguments["current_affiliation"]?.Value);

        var claims = new List<Claim>();
        foreach (var (affId, posId) in pastAff.Concat(curAff))
        {
            // An affiliation that could not be resolved is recorded as "some value".
            var claim = affId != null
                ? new Claim("P76", affId, BuiltInDataTypes.WikibaseItem)
                : new Claim(new Snak("P76", SnakType.SomeValue));
            if (posId != null)
            {
                claim.Qualifiers.Add(new Snak("P92", posId, BuiltInDataTypes.WikibaseItem));
            }
            // Only look up labels for ids that exist; the original passed a null
            // affiliation id to LabelFromEntity while guarding the position id.
            Logger.LogInformation("Affiliation: {}, Pos: {}",
                affId == null ? null : CPRepository.LabelFromEntity(affId, "en"),
                posId == null ? null : CPRepository.LabelFromEntity(posId, "en"));
            claims.Add(claim);
        }

        if (claims.Any())
        {
            await entity.EditAsync(claims.Select(c => new EntityEditEntry(nameof(entity.Claims), c)),
                "Populate affiliations from zhwarriorswiki.", EntityEditOptions.Bot);
        }
    }

    // Turns the raw (Name, Book, Location) affiliations of one infobox argument into
    // (affiliation entity id, position entity id) pairs; either element may be null.
    async Task<IList<(string AffId, string PosId)>> ExtractAffiliationsEx(Node afNode)
    {
        var rawAffiliations = afNode == null ? null : ExtractAffiliations(afNode);
        if (rawAffiliations == null || rawAffiliations.Count == 0)
        {
            return new List<(string AffId, string PosId)>();
        }

        // All affiliations of the argument are resolved concurrently.
        var processed = await Task.WhenAll(
            rawAffiliations.Select(aff => SubTask(aff.Name, aff.Book, aff.Location)));
        return processed;

        async Task<(string, string)> SubTask(string aff, string book, string location)
        {
            // Prefer the zh site link; fall back to a label lookup.
            var affid = CPRepository.EntityFromZhSiteLink(aff) ?? CPRepository.EntityFromLabel(aff);

            // Chapter-level position: search for "<book>-<location>". Skipped when there is
            // no book, since the query would otherwise degrade to "-<location>".
            if (book != null && location != null && location.EndsWith("章"))
            {
                var chapter = (await Site.SearchItemsAsync(book + "-" + location)).FirstOrDefault();
                if (chapter != null)
                {
                    return (affid, chapter);
                }
            }

            // Book-level position, looked up once per book title and cached (misses too).
            if (book != null)
            {
                if (!bookLocationCacheDict.TryGetValue(book, out var bookItem))
                {
                    bookItem = (await Site.SearchItemsAsync(book)).FirstOrDefault();
                    bookLocationCacheDict.TryAdd(book, bookItem);
                }
                if (bookItem != null)
                {
                    return (affid, bookItem);
                }
            }

            return (affid, null);
        }
    }
}