/// <summary>
/// Walks the per-year "browse by issue date" listings under
/// http://elib.ngonb.ru/jspui/handle/NGONB/32, resolves the PDF link of every
/// listed issue and hands it to <c>WebCrawler.AddFile</c> under the relative
/// name <c>yyyy\yyyy_MM_dd_number.pdf</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Alternative target directory under the user profile:
    // var directoryName = Environment.ExpandEnvironmentVariables(@"%USERPROFILE%\personal\gazeta\sovsib");
    var directoryName = @"D:\gazeta\sovsib";
    var webCrawler = new WebCrawler(directoryName);

    const String baseUrl = "http://elib.ngonb.ru";
    const String handleUrl = baseUrl + "/jspui/handle/NGONB/";

    // The collection page offers one <option> per year; the captured group is
    // the handle id of that year's sub-collection.
    var years = webCrawler.ExtractAll(handleUrl + "32", @"<option value=""NGONB/(\d+)"">\d\d\d\d</option>");
    foreach (var year in years)
    {
        var url = handleUrl + year + "/browse?type=dateissued&submit_browse=Issue+Date";

        // The listing is paginated: keep following the "next" link until none is found.
        while (true)
        {
            var issues = webCrawler.ExtractAll(url, @"<a href=""/jspui/handle/NGONB/(\d+)"">(.*?)</a></td>");
            foreach (var issue in issues)
            {
                var issueUrl = handleUrl + issue;
                var pdfUrl = webCrawler.Extract(issueUrl, @"""(/jspui/bitstream/NGONB/" + issue + @"(?:.+?).pdf)""");
                if (String.IsNullOrEmpty(pdfUrl))
                {
                    // No PDF link on the issue page: nothing to fetch.
                    continue;
                }

                pdfUrl = baseUrl + pdfUrl;

                var date = webCrawler.Extract(issueUrl, @">(\d\d\d\d-\d\d-\d\d)<");
                if (String.IsNullOrEmpty(date))
                {
                    // Without a date neither the year directory nor the file name can be
                    // built; skip this issue instead of crashing on date.Substring below.
                    Console.Error.WriteLine("Skipping {0}: no issue date found on {1}", pdfUrl, issueUrl);
                    continue;
                }

                var directory = date.Substring(0, 4);

                // NOTE(review): if no number is found, String.Format renders it as an
                // empty string and the file is named "yyyy_MM_dd_.pdf" - confirm this is acceptable.
                var number = webCrawler.Extract(issueUrl, @">(\d+).pdf<");
                var fileName = String.Format(@"{0}\{1}_{2}.pdf", directory, date.Replace('-', '_'), number);

                Console.WriteLine("{0} => {1}", pdfUrl, fileName);
                webCrawler.AddFile(pdfUrl, fileName);
            }

            // next page
            var nextPage = webCrawler.Extract(url, @"href=""(.+?)"">next");
            if (String.IsNullOrEmpty(nextPage))
            {
                break;
            }

            // The href is taken from HTML markup, so it is unescaped before being requested.
            url = baseUrl + nextPage.XmlUnescape();
        }
    }
}
/// <summary>
/// Reads the year pages linked from http://www.booksite.ru/krassever/index.htm,
/// collects every PDF link on them and hands each one to
/// <c>WebCrawler.AddFile</c> under the relative name
/// <c>yyyy\yyyy_[A_|B_]NNN.pdf</c>.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
/// <exception cref="InvalidOperationException">
/// A PDF sits in a sub-directory other than "izvestya" or "krassever".
/// </exception>
static void Main(string[] args)
{
    var directoryName = Environment.ExpandEnvironmentVariables(@"%USERPROFILE%\personal\gazeta\krasev");
    // Alternative fixed target directory:
    // var directoryName = @"D:\gazeta\krasev";
    var webCrawler = new WebCrawler(directoryName);

    const String baseUrl = "http://www.booksite.ru/krassever/";

    var years = webCrawler.ExtractAll(baseUrl + "index.htm", @" href=""(\d\d\d\d\..+?)""");
    foreach (var year in years)
    {
        var yearUrl = baseUrl + year;
        var issues = webCrawler.ExtractAll(yearUrl, @"<a href=""(\d\d\d\d/(?:\w+/)?\d\d\d\d_\d+\.pdf)");
        foreach (var issue in issues)
        {
            var pdfUrl = baseUrl + issue;

            // All name parts are cut out of the PDF URL itself.
            var directory = webCrawler.Parse(pdfUrl, @"/(\d\d\d\d)/");
            var number = webCrawler.Parse(pdfUrl, @"_(\d+)\.").PadLeft(3, '0');

            // Optional sub-directory between the year and the file name; it is
            // mapped to a one-letter prefix in the target file name.
            var name = webCrawler.Parse(pdfUrl, @"/\d\d\d\d/(?:(\w+)/)\d\d\d\d_");
            if (!String.IsNullOrEmpty(name))
            {
                switch (name)
                {
                    case "izvestya":
                        name = "A_";
                        break;
                    case "krassever":
                        name = "B_";
                        break;
                    default:
                        // Unknown layout: fail loudly and say what was encountered.
                        throw new InvalidOperationException(
                            String.Format("Unexpected sub-directory '{0}' in {1}", name, pdfUrl));
                }
            }

            // {0} appears twice on purpose: the year is both the directory and the
            // file-name prefix. An absent sub-directory renders as an empty string.
            var fileName = String.Format(@"{0}\{0}_{2}{1}.pdf", directory, number, name);

            Console.WriteLine("{0} => {1}", pdfUrl, fileName);
            webCrawler.AddFile(pdfUrl, fileName);
        }
    }
}