Exemplo n.º 1
0
        /// <summary>
        /// Loads every stored page title, derives the node-key info from them and writes it to
        /// <c>NodeKeysInfoOutputFileName</c>; then, for each title in <c>Regular</c>, extracts the
        /// references from the page content, maps them through <c>MapReference</c>, and writes the
        /// resulting title-to-references map to <c>EdgesOutputFileName</c>.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        private static async Task Main(string[] args)
        {
            var storage = new MongoWikiDownloadStorage(new MongoClient(MongoDbConnString));

            var allTitles = await storage.GetAllPageTitlesAsync();
            var keysInfo  = NodeKeysInfoCalculator.Build(allTitles);

            // The node-key info is persisted before any page content is fetched.
            var keysInfoJson = keysInfo.ToJson();
            File.WriteAllText(NodeKeysInfoOutputFileName, keysInfoJson);

            var edgesByTitle = new Dictionary<string, string[]>();

            foreach (var pageTitle in keysInfo.Regular)
            {
                var pageContent = await storage.GetPageContent(pageTitle);

                // References that MapReference maps to null are dropped.
                var mappedTargets = from reference in ExtractReferences(pageContent)
                                    let mapped = MapReference(keysInfo, reference)
                                    where mapped != null
                                    select mapped;

                // Duplicates are removed so each target appears once per source title.
                edgesByTitle[pageTitle] = mappedTargets.Distinct().ToArray();
            }

            var edgesJson = edgesByTitle.ToJson();
            File.WriteAllText(EdgesOutputFileName, edgesJson);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the article links from <c>ArticleListFileName</c>, looks each one up among the
        /// stored page titles, reduces the stored page content to its inner text, and writes two
        /// JSON files: the extraction info (<c>ExtractInfoOutputFileName</c>) and the extracted
        /// text keyed by link (<c>ExtractedPagesOutputFileName</c>).
        /// </summary>
        /// <remarks>
        /// A link is recorded in <c>errors</c> when no title matches it or when the storage
        /// returns <c>null</c> content for it.
        /// </remarks>
        private static async Task Main()
        {
            var mongoClient = new MongoClient(MongoDbConnString);
            var storage     = new MongoWikiDownloadStorage(mongoClient);

            var links = await File.ReadAllLinesAsync(ArticleListFileName);

            var titles = await storage.GetAllPageTitlesAsync();

            var titlesDict = titles.ToDictionary(x => x.Name);

            // NOTE: refs is never populated in this method; it is kept so the serialized
            // extract info still carries its "refs" property.
            var refs   = new List<string>();
            var errors = new List<string>();
            var pages  = new Dictionary<string, string>();

            foreach (var link in links)
            {
                // Titles are looked up with the link's underscores replaced by spaces.
                var name = link.Replace("_", " ");

                // Single lookup instead of ContainsKey followed by the indexer.
                if (!titlesDict.TryGetValue(name, out var title))
                {
                    errors.Add(link);

                    continue;
                }

                // Content is fetched under ReferenceName when it is set, otherwise under Name.
                var content = await storage.GetPageContent(title.ReferenceName ?? title.Name);

                if (content == null)
                {
                    errors.Add(link);

                    continue;
                }

                var document = new HtmlDocument();
                document.LoadHtml(content);

                // Keyed by the original link text, not by the normalized name.
                pages[link] = document.DocumentNode.InnerText;
            }

            var extractInfo = new { errors, refs };

            await File.WriteAllTextAsync(ExtractInfoOutputFileName, JsonConvert.SerializeObject(extractInfo));

            await File.WriteAllTextAsync(ExtractedPagesOutputFileName, JsonConvert.SerializeObject(pages));
        }
Exemplo n.º 3
0
        /// <summary>
        /// Downloads page titles in batches through <c>WikiApiProvider.GetAllPagesBatch</c> and
        /// stores them, persisting the continuation token under <c>ApContinuePropName</c> after
        /// every batch. The loop starts from the stored token and ends once the token is the
        /// empty string.
        /// </summary>
        /// <param name="args">Command-line arguments; not read by this method.</param>
        internal static async Task Main(string[] args)
        {
            // HttpClient is IDisposable; release it when the download finishes or fails.
            using (var httpClient = new HttpClient())
            {
                var wikiApiProvider = new WikiApiProvider(httpClient, new Uri(SimpleApiUrl));

                var mongoClient = new MongoClient(MongoDbConnString);
                var storage     = new MongoWikiDownloadStorage(mongoClient);

                var apContinue = await storage.GetProperty<string>(ApContinuePropName);

                // Only the empty string stops the loop; any other stored value (including null)
                // is passed to the API as the starting point.
                while (apContinue != string.Empty)
                {
                    var (pages, newApContinue) = await wikiApiProvider.GetAllPagesBatch(apContinue, null);

                    // A null token from the API is normalized to the empty-string end marker.
                    apContinue = newApContinue ?? string.Empty;

                    // Titles are stored before the token so the token never runs ahead of the data.
                    await storage.AddPageTitlesAsync(pages);

                    // Persist the normalized token rather than the raw one: writing null on the
                    // final batch would leave a value the loop condition treats as "not finished"
                    // if the process stopped before the write below.
                    await storage.SetProperty(ApContinuePropName, apContinue);
                }

                await storage.SetProperty(ApContinuePropName, string.Empty);
            }
        }