public List<ExtractResult> Extract(string url)
{
    // Download the page and read the response body as a string.
    var crawler = new RuiJi.Net.NodeVisitor.Crawler();
    var response = crawler.Request(url);
    var content = response.Data.ToString();

    var results = new List<ExtractResult>();

    // Look up the extract blocks configured for this url and run each one
    // against the downloaded content.
    var blocks = Feeder.GetExtractBlock(url);
    blocks.ForEach((m) =>
    {
        var r = RuiJi.Net.NodeVisitor.Extracter.Extract(new ExtractRequest
        {
            Block = m,
            Content = content
        });
        results.Add(r);
    });

    return results;
}
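For context, a minimal usage sketch of the method above. The hosting class name (FeedExtractor), the sample url, and the use of Metas.Count for output are assumptions, not part of the library's documented API:

// A hedged usage sketch; FeedExtractor and the url are placeholders.
var extractor = new FeedExtractor();
var results = extractor.Extract("http://example.com/news"); // placeholder url

foreach (var result in results)
{
    // Each ExtractResult carries the metas extracted by one block
    // (Metas is referenced in DownloadPage below).
    Console.WriteLine("extracted " + result.Metas.Count + " metas");
}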
// Requires: using System.IO; using System.Linq; using System.Threading;
//           using DiffPlex; using DiffPlex.DiffBuilder;
private static void DownloadPage(Uri uri, ExtractResult result, string method, string ip, PageDownloadHandler handler, int maxRetry = 10)
{
    // Emit the first page, then walk its paging links.
    handler(uri, result);

    var pages = new Dictionary<string, ExtractResult>();
    pages.Add(uri.ToString(), result);

    var lines = string.Join("\n", result.Paging.Distinct());
    var reader = new StringReader(lines);
    var crawler = new RuiJiCrawler();
    var url = reader.ReadLine();
    var diffBuilder = new InlineDiffBuilder(new Differ());

    while (!string.IsNullOrEmpty(url))
    {
        var u = new Uri(uri, url);

        // Skip urls that have already been downloaded.
        if (pages.ContainsKey(u.ToString()))
        {
            url = reader.ReadLine();
            continue;
        }

        var request = new Request(u);
        request.Method = method;
        if (!string.IsNullOrEmpty(ip))
        {
            request.Ip = ip;
        }

        var response = crawler.Request(request);
        var content = response.Data.ToString();

        // Run all configured extract blocks and keep the richest result.
        var blocks = Feeder.GetExtractBlock(u.ToString());
        var er = new ExtractRequest
        {
            Blocks = blocks,
            Content = content
        };
        var results = Extractor.Extract(er);
        var r = results.OrderByDescending(m => m.Metas.Count).FirstOrDefault();

        // Extraction yielded nothing usable: back off and retry the same url,
        // giving up after maxRetry attempts.
        if (r == null || r.Paging == null)
        {
            Thread.Sleep(3000);
            if (--maxRetry == 0)
            {
                break;
            }
            continue;
        }

        // Emit the newly downloaded page and remember it so it is not
        // fetched twice.
        handler(u, r);
        pages.Add(u.ToString(), r);

        if (r.Paging.Count > 0)
        {
            // Diff the new paging links against the ones already seen and
            // continue reading urls from the merged list.
            var nlines = string.Join("\n", r.Paging.Distinct());
            var diff = diffBuilder.BuildDiffModel(lines, nlines);
            nlines = string.Join("\n", diff.Lines.Select(m => m.Text));
            reader = new StringReader(nlines);
            url = reader.ReadLine();
        }
        else
        {
            // No further paging on this page: advance to the next queued url.
            url = reader.ReadLine();
        }
    }
}
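A hedged sketch of how DownloadPage might be driven. The PageDownloadHandler signature is inferred from the handler(uri, result) calls above, and the seed url and ExtractFirstPage helper are hypothetical placeholders:

// Delegate signature inferred from the calls above; the declaration itself
// is an assumption:
// public delegate void PageDownloadHandler(Uri uri, ExtractResult result);

var seedUri = new Uri("http://example.com/news"); // placeholder url
var first = ExtractFirstPage(seedUri);            // hypothetical helper returning the seed page's ExtractResult

DownloadPage(seedUri, first, "GET", null, (pageUri, pageResult) =>
{
    // The seed page and every paging link arrive here exactly once.
    Console.WriteLine(pageUri + " -> " + pageResult.Metas.Count + " metas");
});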