public async Task ExportItemsAsync(TextReader itemsDumpReader, LuaModuleFactory moduleFactory)
    {
        // Exports each entity in the dump as its own Lua module, keeping only the
        // labels/descriptions/aliases for the configured languages, and logs
        // progress at most once per StatusReportInterval.
        if (itemsDumpReader is null)
        {
            throw new ArgumentNullException(nameof(itemsDumpReader));
        }
        if (moduleFactory is null)
        {
            throw new ArgumentNullException(nameof(moduleFactory));
        }
        var languageFilter = new List<string>(Languages ?? defaultLanguages);
        var itemCount = 0;
        var propertyCount = 0;
        var reportTimer = Stopwatch.StartNew();

        foreach (var entity in SerializableEntity.LoadAll(itemsDumpReader))
        {
            switch (entity.Type)
            {
                case EntityType.Item:
                    itemCount++;
                    break;
                case EntityType.Property:
                    propertyCount++;
                    break;
            }

            // Strip monolingual texts down to the requested languages before persisting.
            entity.Labels = FilterMonolingualTexts(entity.Labels, languageFilter);
            entity.Descriptions = FilterMonolingualTexts(entity.Descriptions, languageFilter);
            entity.Aliases = FilterMonolingualTexts(entity.Aliases, languageFilter);

            // Persist: one module per entity; the inner writer is disposed (and the
            // epilog flushed) before the module is submitted.
            using (var module = moduleFactory.GetModule(entity.Id))
            {
                using (var moduleWriter = module.Writer)
                {
                    // NOTE(review): assumes an "en" label exists after filtering — confirm
                    // against the language configuration and Labels indexer semantics.
                    WriteProlog(moduleWriter, $"Entity: {entity.Id} ({entity.Labels["en"]})");
                    using (var jsonWriter = new JsonLuaWriter(moduleWriter)
                    {
                        CloseOutput = false
                    })
                    {
                        entity.WriteTo(jsonWriter);
                    }

                    WriteEpilog(moduleWriter);
                }

                await module.SubmitAsync($"Export entity {entity.Id}.");
            }

            if (reportTimer.Elapsed > StatusReportInterval)
            {
                reportTimer.Restart();
                Logger.Information("Exported LUA modules for {Items} items and {Properties} properties.", itemCount, propertyCount);
            }
        }
        Logger.Information("Exported LUA modules for {Items} items and {Properties} properties.", itemCount, propertyCount);
    }
    public async Task ExportSiteLinksAsync(TextReader itemsDumpReader, LuaModuleFactory moduleFactory, int shardCount)
    {
        if (itemsDumpReader == null)
        {
            throw new ArgumentNullException(nameof(itemsDumpReader));
        }
        if (moduleFactory == null)
        {
            throw new ArgumentNullException(nameof(moduleFactory));
        }
        if (shardCount <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(shardCount));
        }
        if (ClientSiteName == null)
        {
            throw new ArgumentNullException(nameof(ClientSiteName));
        }

        var shards = Enumerable.Range(0, shardCount).Select(index =>
        {
            var module = moduleFactory.GetModule(index.ToString());
            WriteProlog(module.Writer, $"Shard: {index + 1}/{shardCount}");
            return(module);
        }).ToList();
        var shardLuaWriters = shards.Select(m =>
                                            new LuaTableTextWriter(m.Writer)
        {
            CloseWriter = false, Formatting = Formatting.Prettified
        })
                              .ToList();

        foreach (var writer in shardLuaWriters)
        {
            writer.WriteStartTable();
        }
        try
        {
            foreach (var entity in SerializableEntity.LoadAll(itemsDumpReader))
            {
                var siteLink = entity.SiteLinks.FirstOrDefault(l => l.Site == ClientSiteName);
                if (siteLink == null)
                {
                    continue;
                }
                var shardIndex = Utility.HashString(siteLink.Title) % shardCount;
                var writer     = shardLuaWriters[shardIndex];
                writer.WriteKey(siteLink.Title);
                writer.WriteLiteral(entity.Id);
            }

            Logger.Information("Exporting LUA modules. Shards = {Shards}", shards.Count);
            for (var i = 0; i < shards.Count; i++)
            {
                shardLuaWriters[i].WriteEndTable();
                shardLuaWriters[i].Close();
                WriteEpilog(shards[i].Writer);
                await shards[i].SubmitAsync($"Export SiteLink table. Shard {i + 1}/{shards.Count}.");
            }
        }
        finally
        {
            foreach (var s in shards)
            {
                s.Dispose();
            }
        }
    }
// Beispiel #3 ("Example #3", vote count 0) — stray artifact from the code-sample
// aggregation this file was scraped from; kept as a comment so the file parses.
    public async Task ExportModulesAsync()
    {
        if (SiteConfig == null)
        {
            throw new ArgumentNullException(nameof(SiteConfig));
        }
        var clusterDict  = new ConcurrentDictionary <string, AotSparqlModuleRoot>();
        var queryCounter = 0;

        foreach (var(queryName, queryContent) in SiteConfig.Queries)
        {
            queryCounter++;
            Logger.Information("Processing query {Counter}/{Total}: {Name}.",
                               queryCounter, SiteConfig.Queries.Count, queryName);
            try
            {
                // Execute params query first.
                var paramsResult    = executor.Execute(queryContent.ParamsQuery);
                var clusterVariable = SparqlQuery.ParamsQueryParamPrefix + queryContent.ClusteredBy;
                if (!paramsResult.Variables.Contains(clusterVariable))
                {
                    throw new InvalidOperationException(
                              $"Specified clustering variable {clusterVariable} does not exist in the result set from ParamsQuery.");
                }
                Logger.Information("Parameter query returned {Count} results.", paramsResult.Count);
                if (paramsResult.Count == 0)
                {
                    continue;
                }
                var paramNames = paramsResult.Variables
                                 .Where(v => v.StartsWith(SparqlQuery.ParamsQueryParamPrefix))
                                 .Select(v => (ParamName: v.Substring(SparqlQuery.ParamsQueryParamPrefix.Length), ResultName: v))
                                 .ToList();
                var pivotParamNames = paramNames.Where(p => p.ParamName != queryContent.ClusteredBy).ToList();
                var resultVariables = executor.GetResultVariables(queryContent.SourceQuery, paramNames.Select(n => n.ParamName));
                int minResultRows = int.MaxValue, maxResultRows = -1;
                var paramSetCounter = 0;
                // Traverse the params query result set.
                foreach (var row in paramsResult)
                {
                    paramSetCounter++;
                    Logger.Verbose("Processing param set {Counter}/{Total}.", paramSetCounter, paramsResult.Count);
                    var clusterNode = row[clusterVariable];
                    var clusterKey  = SerializeClusterKey(clusterNode);
                    var queryParams = paramNames.ToDictionary(p => p.ParamName, p => row.Value(p.ResultName));
                    var cluster     = clusterDict.GetOrAdd(clusterKey,
                                                           k => new AotSparqlModuleRoot
                    {
                        ResultSets = new SortedDictionary <string, AotSparqlQueryResultSet>()
                    });
                    AotSparqlQueryResultSet resultSet;
                    lock (cluster)
                    {
                        if (!cluster.ResultSets.TryGetValue(queryName, out resultSet))
                        {
                            resultSet = new AotSparqlQueryResultSet {
                                Results = new List <AotSparqlQueryResult>()
                            };
                            if (pivotParamNames.Count > 0)
                            {
                                resultSet.PivotParams = pivotParamNames.Select(p => p.ParamName).ToList();
                            }
                            resultSet.Columns = resultVariables;
                            cluster.ResultSets.Add(queryName, resultSet);
                        }
                    }
                    var queryResult = executor.ExecuteAndSerialize(queryContent.SourceQuery, resultVariables, queryParams);
                    minResultRows = Math.Min(minResultRows, queryResult.Rows.Count);
                    maxResultRows = Math.Max(maxResultRows, queryResult.Rows.Count);
                    lock (cluster)
                    {
                        if (pivotParamNames.Count > 0)
                        {
                            queryResult.PivotValues = pivotParamNames
                                                      .Select(p => executor.SerializeNode(queryParams[p.ParamName]))
                                                      .ToList();
                        }
                        resultSet.Results.Add(queryResult);
                    }
                }
                if (maxResultRows < 0)
                {
                    Debug.Assert(minResultRows == int.MaxValue);
                    minResultRows = -1;
                }
                Logger.Information("Executed query {QueryName} with {ParamRows} param sets. Source query results: min: {MinResults}, max: {MaxResults}.",
                                   queryName, paramsResult.Count, minResultRows, maxResultRows);
            }
            catch (Exception ex)
            {
                Logger.Error(ex, "Failed to execute query {QueryName}.", queryName);
                throw;
            }
        }
        Logger.Information("Normalize clustered modules…");
        // Sort result rows to ensure we only update modules when we need to.
        foreach (var root in clusterDict.Values)
        {
            foreach (var resultSet in root.ResultSets.Values)
            {
                var results = (List <AotSparqlQueryResult>)resultSet.Results;
                results.Sort((x, y) => SequenceComparer <object> .Default.Compare(x.PivotValues, y.PivotValues));
                foreach (var result in results)
                {
                    var rows = (List <IList <object> >)result.Rows;
                    rows.Sort(SequenceComparer <object> .Default);
                }
            }
        }
        Logger.Information("Writing {Count} clustered modules…", clusterDict.Count);
        var statusReportSw = Stopwatch.StartNew();
        var writtenCount   = 0;

        foreach (var(name, root) in clusterDict)
        {
            using (var module = moduleFactory.GetModule(name))
            {
                WriteProlog(module.Writer, "Cluster key: " + name);
                using (var jwriter = new JsonLuaWriter(module.Writer)
                {
                    CloseOutput = false
                })
                {
                    luaModuleJsonSerializer.Serialize(jwriter, root);
                }
                WriteEpilog(module.Writer);
                await module.SubmitAsync("Export clustered SPARQL query result for " + name + ".");

                writtenCount++;
                if (statusReportSw.Elapsed >= StatusReportInterval)
                {
                    Logger.Information("Written {Count}/{Total} clustered modules.", writtenCount, clusterDict.Count);
                }
                statusReportSw.Restart();
            }
        }
        Logger.Information("Written {Count}/{Total} clustered modules.", writtenCount, clusterDict.Count);
    }