public async Task ExportItemsAsync(TextReader itemsDumpReader, LuaModuleFactory moduleFactory)
{
    if (itemsDumpReader == null) { throw new ArgumentNullException(nameof(itemsDumpReader)); }
    if (moduleFactory == null) { throw new ArgumentNullException(nameof(moduleFactory)); }
    var languages = new List<string>(Languages ?? defaultLanguages);
    int items = 0, properties = 0;
    var statusReportSw = Stopwatch.StartNew();
    foreach (var entity in SerializableEntity.LoadAll(itemsDumpReader))
    {
        if (entity.Type == EntityType.Item) { items++; }
        else if (entity.Type == EntityType.Property) { properties++; }
        // Preprocess: keep only the configured languages to reduce module size.
        entity.Labels = FilterMonolingualTexts(entity.Labels, languages);
        entity.Descriptions = FilterMonolingualTexts(entity.Descriptions, languages);
        entity.Aliases = FilterMonolingualTexts(entity.Aliases, languages);
        // Persist the entity as a Lua module.
        using (var module = moduleFactory.GetModule(entity.Id))
        {
            using (var writer = module.Writer)
            {
                WriteProlog(writer, $"Entity: {entity.Id} ({entity.Labels["en"]})");
                using (var luawriter = new JsonLuaWriter(writer) { CloseOutput = false })
                {
                    entity.WriteTo(luawriter);
                }
                WriteEpilog(writer);
            }
            await module.SubmitAsync($"Export entity {entity.Id}.");
        }
        if (statusReportSw.Elapsed > StatusReportInterval)
        {
            statusReportSw.Restart();
            Logger.Information("Exported Lua modules for {Items} items and {Properties} properties.", items, properties);
        }
    }
    Logger.Information("Exported Lua modules for {Items} items and {Properties} properties.", items, properties);
}
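// FilterMonolingualTexts is referenced above but defined elsewhere in this class.
// A minimal sketch of the intended filtering, assuming WikiClientLibrary's
// WbMonolingualText (a Language/Text pair) and a collection type constructible from a
// sequence; the real helper may have separate overloads for labels/descriptions vs. aliases.
private static WbMonolingualTextCollection FilterMonolingualTexts(
    WbMonolingualTextCollection texts, ICollection<string> languages)
{
    if (texts == null) { return null; }
    // Keep only the languages requested in the export configuration.
    return new WbMonolingualTextCollection(texts.Where(t => languages.Contains(t.Language)));
}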
public async Task ExportSiteLinksAsync(TextReader itemsDumpReader, LuaModuleFactory moduleFactory, int shardCount)
{
    if (itemsDumpReader == null) { throw new ArgumentNullException(nameof(itemsDumpReader)); }
    if (moduleFactory == null) { throw new ArgumentNullException(nameof(moduleFactory)); }
    if (shardCount <= 0) { throw new ArgumentOutOfRangeException(nameof(shardCount)); }
    if (ClientSiteName == null) { throw new InvalidOperationException($"{nameof(ClientSiteName)} is not set."); }
    // One module per shard; write the prolog up front.
    var shards = Enumerable.Range(0, shardCount).Select(index =>
    {
        var module = moduleFactory.GetModule(index.ToString());
        WriteProlog(module.Writer, $"Shard: {index + 1}/{shardCount}");
        return module;
    }).ToList();
    var shardLuaWriters = shards
        .Select(m => new LuaTableTextWriter(m.Writer) { CloseWriter = false, Formatting = Formatting.Prettified })
        .ToList();
    foreach (var writer in shardLuaWriters)
    {
        writer.WriteStartTable();
    }
    try
    {
        foreach (var entity in SerializableEntity.LoadAll(itemsDumpReader))
        {
            var siteLink = entity.SiteLinks.FirstOrDefault(l => l.Site == ClientSiteName);
            if (siteLink == null) { continue; }
            // The hash must be stable across runs so each title always lands in the same shard.
            var shardIndex = Utility.HashString(siteLink.Title) % shardCount;
            var writer = shardLuaWriters[shardIndex];
            writer.WriteKey(siteLink.Title);
            writer.WriteLiteral(entity.Id);
        }
        Logger.Information("Exporting Lua modules. Shards = {Shards}", shards.Count);
        for (var i = 0; i < shards.Count; i++)
        {
            shardLuaWriters[i].WriteEndTable();
            shardLuaWriters[i].Close();
            WriteEpilog(shards[i].Writer);
            await shards[i].SubmitAsync($"Export SiteLink table. Shard {i + 1}/{shards.Count}.");
        }
    }
    finally
    {
        foreach (var s in shards) { s.Dispose(); }
    }
}
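// Utility.HashString (used above) is assumed to be a deterministic, non-negative string
// hash: string.GetHashCode is randomized per process on .NET Core, so it cannot be used
// for shard assignment, and a negative result would make the modulo above yield a
// negative shard index. A minimal FNV-1a sketch with both properties:
public static int HashString(string s)
{
    unchecked
    {
        var hash = 2166136261u;          // FNV-1a offset basis
        foreach (var c in s)
        {
            hash ^= c;
            hash *= 16777619u;           // FNV-1a prime
        }
        return (int)(hash & 0x7FFFFFFF); // clear the sign bit => always non-negative
    }
}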
public async Task ExportModulesAsync()
{
    if (SiteConfig == null) { throw new InvalidOperationException($"{nameof(SiteConfig)} is not set."); }
    var clusterDict = new ConcurrentDictionary<string, AotSparqlModuleRoot>();
    var queryCounter = 0;
    foreach (var (queryName, queryContent) in SiteConfig.Queries)
    {
        queryCounter++;
        Logger.Information("Processing query {Counter}/{Total}: {Name}.", queryCounter, SiteConfig.Queries.Count, queryName);
        try
        {
            // Execute the params query first; its rows drive the parameterized source query.
            var paramsResult = executor.Execute(queryContent.ParamsQuery);
            var clusterVariable = SparqlQuery.ParamsQueryParamPrefix + queryContent.ClusteredBy;
            if (!paramsResult.Variables.Contains(clusterVariable))
            {
                throw new InvalidOperationException(
                    $"Specified clustering variable {clusterVariable} does not exist in the result set from ParamsQuery.");
            }
            Logger.Information("Parameter query returned {Count} results.", paramsResult.Count);
            if (paramsResult.Count == 0) { continue; }
            var paramNames = paramsResult.Variables
                .Where(v => v.StartsWith(SparqlQuery.ParamsQueryParamPrefix))
                .Select(v => (ParamName: v.Substring(SparqlQuery.ParamsQueryParamPrefix.Length), ResultName: v))
                .ToList();
            // Pivot params are all params except the clustering one; they distinguish
            // the multiple results stored under the same cluster key.
            var pivotParamNames = paramNames.Where(p => p.ParamName != queryContent.ClusteredBy).ToList();
            var resultVariables = executor.GetResultVariables(queryContent.SourceQuery, paramNames.Select(n => n.ParamName));
            int minResultRows = int.MaxValue, maxResultRows = -1;
            var paramSetCounter = 0;
            // Traverse the params query result set.
            foreach (var row in paramsResult)
            {
                paramSetCounter++;
                Logger.Verbose("Processing param set {Counter}/{Total}.", paramSetCounter, paramsResult.Count);
                var clusterNode = row[clusterVariable];
                var clusterKey = SerializeClusterKey(clusterNode);
                var queryParams = paramNames.ToDictionary(p => p.ParamName, p => row.Value(p.ResultName));
                var cluster = clusterDict.GetOrAdd(clusterKey, k => new AotSparqlModuleRoot
                {
                    ResultSets = new SortedDictionary<string, AotSparqlQueryResultSet>()
                });
                AotSparqlQueryResultSet resultSet;
                lock (cluster)
                {
                    if (!cluster.ResultSets.TryGetValue(queryName, out resultSet))
                    {
                        resultSet = new AotSparqlQueryResultSet { Results = new List<AotSparqlQueryResult>() };
                        if (pivotParamNames.Count > 0)
                        {
                            resultSet.PivotParams = pivotParamNames.Select(p => p.ParamName).ToList();
                        }
                        resultSet.Columns = resultVariables;
                        cluster.ResultSets.Add(queryName, resultSet);
                    }
                }
                var queryResult = executor.ExecuteAndSerialize(queryContent.SourceQuery, resultVariables, queryParams);
                minResultRows = Math.Min(minResultRows, queryResult.Rows.Count);
                maxResultRows = Math.Max(maxResultRows, queryResult.Rows.Count);
                lock (cluster)
                {
                    if (pivotParamNames.Count > 0)
                    {
                        queryResult.PivotValues = pivotParamNames
                            .Select(p => executor.SerializeNode(queryParams[p.ParamName]))
                            .ToList();
                    }
                    resultSet.Results.Add(queryResult);
                }
            }
            if (maxResultRows < 0)
            {
                // No param sets were processed at all.
                Debug.Assert(minResultRows == int.MaxValue);
                minResultRows = -1;
            }
            Logger.Information(
                "Executed query {QueryName} with {ParamRows} param sets. Source query results: min: {MinResults}, max: {MaxResults}.",
                queryName, paramsResult.Count, minResultRows, maxResultRows);
        }
        catch (Exception ex)
        {
            Logger.Error(ex, "Failed to execute query {QueryName}.", queryName);
            throw;
        }
    }
    Logger.Information("Normalizing clustered modules…");
    // Sort result rows so that module content is deterministic and we only update modules when we need to.
    foreach (var root in clusterDict.Values)
    {
        foreach (var resultSet in root.ResultSets.Values)
        {
            var results = (List<AotSparqlQueryResult>)resultSet.Results;
            results.Sort((x, y) => SequenceComparer<object>.Default.Compare(x.PivotValues, y.PivotValues));
            foreach (var result in results)
            {
                var rows = (List<IList<object>>)result.Rows;
                rows.Sort(SequenceComparer<object>.Default);
            }
        }
    }
    Logger.Information("Writing {Count} clustered modules…", clusterDict.Count);
    var statusReportSw = Stopwatch.StartNew();
    var writtenCount = 0;
    foreach (var (name, root) in clusterDict)
    {
        using (var module = moduleFactory.GetModule(name))
        {
            WriteProlog(module.Writer, "Cluster key: " + name);
            using (var jwriter = new JsonLuaWriter(module.Writer) { CloseOutput = false })
            {
                luaModuleJsonSerializer.Serialize(jwriter, root);
            }
            WriteEpilog(module.Writer);
            await module.SubmitAsync("Export clustered SPARQL query result for " + name + ".");
            writtenCount++;
            if (statusReportSw.Elapsed >= StatusReportInterval)
            {
                statusReportSw.Restart();
                Logger.Information("Wrote {Count}/{Total} clustered modules.", writtenCount, clusterDict.Count);
            }
        }
    }
    Logger.Information("Wrote {Count}/{Total} clustered modules.", writtenCount, clusterDict.Count);
}
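// SequenceComparer<T>.Default (used above) is assumed to compare two sequences
// lexicographically, giving the sorted result rows and pivot values a deterministic
// order so that re-exports produce byte-identical modules. A minimal sketch, assuming
// the elements (serialized nodes) are mutually comparable via Comparer<T>.Default;
// null sequences sort first:
public class SequenceComparer<T> : IComparer<IList<T>>
{
    public static SequenceComparer<T> Default { get; } = new SequenceComparer<T>();

    public int Compare(IList<T> x, IList<T> y)
    {
        if (ReferenceEquals(x, y)) { return 0; }
        if (x == null) { return -1; }
        if (y == null) { return 1; }
        var count = Math.Min(x.Count, y.Count);
        for (var i = 0; i < count; i++)
        {
            var c = Comparer<T>.Default.Compare(x[i], y[i]);
            if (c != 0) { return c; }
        }
        // A sequence that is a strict prefix of another sorts first.
        return x.Count.CompareTo(y.Count);
    }
}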