/// <summary>
/// Walks numbered city-area files ({basePath} formatted with 1, 2, 3, …) under the current
/// directory and crawls each one, resuming from the area cached in <paramref name="lastCache"/>
/// if a previous run was interrupted. Stops and deletes the cache when the next numbered file
/// does not exist.
/// </summary>
/// <param name="crawler">Crawler that processes each area URI.</param>
/// <param name="lastCache">Path of the checkpoint file holding the last area URI crawled.</param>
/// <param name="basePath">Relative path format string with one placeholder for the file number.</param>
protected virtual async Task ContinueCrawlerAsync(WebCrawler crawler, string lastCache, string basePath)
{
    string lastArea = string.Empty;
    bool lastAreaExists = File.Exists(lastCache);
    if (lastAreaExists)
    {
        // BUG FIX: ReadAllLines returns an empty array for an empty file, so the original
        // unconditional [0] indexing threw IndexOutOfRangeException. Treat an empty cache
        // as "no resume point".
        var cached = File.ReadAllLines(lastCache);
        if (cached.Length > 0)
        {
            lastArea = cached[0];
        }
        else
        {
            lastAreaExists = false;
        }
    }

    for (int count = 1; ; count++)
    {
        LogHelper.Info($"{crawler.GetType()}城市:" + count.ToString());
        var path = Directory.GetCurrentDirectory() + string.Format(basePath, count.ToString());
        if (!File.Exists(path))
        {
            LogHelper.Info($"{crawler.GetType()}爬虫结束");
            File.Delete(lastCache); // File.Delete is a no-op if the file is already gone
            break;
        }

        var areas = File.ReadAllLines(path);

        // BUG FIX: the resume flag must be cleared once the file containing lastArea has been
        // processed. The original passed lastAreaExists unchanged on every iteration, so every
        // file AFTER the resume file was silently skipped in full (SkipWhile finds no match
        // and yields nothing). Files BEFORE the resume file keep the original behavior:
        // the flag stays true and StartCrawlerAsync skips them entirely.
        bool resumesHere = lastAreaExists && Array.IndexOf(areas, lastArea) >= 0;
        await StartCrawlerAsync(crawler, areas, lastAreaExists, lastCache, lastArea);
        if (resumesHere)
        {
            lastAreaExists = false;
        }
    }
}
/// <summary>
/// Crawls each area URI in <paramref name="areas"/>, checkpointing progress to
/// <paramref name="lastCache"/> before every crawl so an interrupted run can resume.
/// </summary>
/// <param name="crawler">Crawler that processes each area URI.</param>
/// <param name="areas">Area URIs to crawl, in order.</param>
/// <param name="lastAreaExists">True when a resume checkpoint was found.</param>
/// <param name="lastCache">Path of the checkpoint file to write the current area into.</param>
/// <param name="lastArea">Area URI recorded by the previous (interrupted) run.</param>
protected async Task StartCrawlerAsync(WebCrawler crawler, string[] areas, bool lastAreaExists, string lastCache, string lastArea)
{
    if (lastAreaExists)
    {
        // Resume: drop every area before the checkpointed one. The checkpointed area itself
        // is deliberately re-crawled, since the previous run may have died mid-crawl on it.
        // If lastArea is not in this file at all, the result is empty and the file is skipped.
        areas = areas.SkipWhile(area => area != lastArea).ToArray();
    }

    // FIX: use Length on the array instead of the LINQ Count() extension (CA1829).
    LogHelper.Info($"剩余城市数量:{areas.Length}");

    foreach (var area in areas)
    {
        // Checkpoint BEFORE crawling so a crash during the crawl resumes at this area.
        File.WriteAllText(lastCache, area);
        await crawler.AgentCrawlerAsync(new Uri(area));
    }
}
/// <summary>
/// Collects every area URI for <paramref name="city"/> via the crawler and writes them to
/// "{areaPath}/{index}.txt". When the city yields no areas, decrements the shared
/// <c>index</c> counter instead of writing a file.
/// </summary>
/// <param name="crawler">Crawler used to enumerate the city's area URIs.</param>
/// <param name="city">City identifier passed to the crawler.</param>
/// <param name="areaPath">Directory the numbered area file is written into.</param>
protected async Task WriteAreaUriToFileAsync(WebCrawler crawler, string city, string areaPath)
{
    var areaList = new List<string>();
    var cityAreaUris = await crawler.GetCityAreaUrisAsync(city);
    foreach (var cityAreaUri in cityAreaUris)
    {
        var areas = await crawler.GetAreasAsync(cityAreaUri);
        // Idiom: AddRange instead of element-by-element Add in a nested loop.
        areaList.AddRange(areas);
    }

    if (areaList.Count > 0)
    {
        File.WriteAllLines(areaPath + $"/{index}.txt", areaList);
    }
    else
    {
        // NOTE(review): `index` is a field declared outside this view; presumably it was
        // pre-incremented by the caller, so an empty city rolls the file number back — confirm.
        --index;
    }
}