Ejemplo n.º 1
0
        /// <summary>
        /// Requests <paramref name="url"/> and runs each extract block returned by
        /// <c>Feeder.GetExtractBlock</c> for that url against the response content.
        /// </summary>
        /// <param name="url">Address of the page to request and extract from.</param>
        /// <returns>One <c>ExtractResult</c> per extract block, in block order.</returns>
        public List <ExtractResult> Extract(string url)
        {
            var crawler  = new RuiJi.Net.NodeVisitor.Crawler();
            var pageText = crawler.Request(url).Data.ToString();

            var extracted = new List<ExtractResult>();

            // Every block is evaluated against the same downloaded content.
            foreach (var block in Feeder.GetExtractBlock(url))
            {
                var request = new ExtractRequest
                {
                    Block   = block,
                    Content = pageText
                };

                extracted.Add(RuiJi.Net.NodeVisitor.Extracter.Extract(request));
            }

            return extracted;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reports the first page to <paramref name="handler"/>, then follows the paging links
        /// found in <paramref name="result"/>: each link not yet visited is requested, extracted
        /// and reported to <paramref name="handler"/>, and the paging links of every newly
        /// extracted page are merged into the list of links still to be walked.
        /// </summary>
        /// <param name="uri">Address of the first page; relative paging links are resolved against it.</param>
        /// <param name="result">Extract result of the first page; its <c>Paging</c> list seeds the walk.</param>
        /// <param name="method">Value assigned to <c>Request.Method</c> for every paging request.</param>
        /// <param name="ip">Value assigned to <c>Request.Ip</c> when not null or empty.</param>
        /// <param name="handler">Callback invoked once for the first page and once for every paging page extracted.</param>
        /// <param name="maxRetry">
        /// Total number of failed extractions (no result, or a result without paging data)
        /// tolerated across the whole walk before it is abandoned.
        /// </param>
        private static void DownloadPage(Uri uri, ExtractResult result, string method, string ip, PageDownloadHandler handler, int maxRetry = 10)
        {
            handler(uri, result);

            // Pages already reported, keyed by absolute url, so no page is downloaded twice.
            var pages = new Dictionary <string, ExtractResult>();

            pages.Add(uri.ToString(), result);

            // Nothing to follow when the first page carries no paging information.
            if (result.Paging == null)
            {
                return;
            }

            var lines  = String.Join("\n", result.Paging.Distinct());
            var reader = new StringReader(lines);

            var url = reader.ReadLine();

            var diffBuilder = new InlineDiffBuilder(new Differ());

            while (!string.IsNullOrEmpty(url))
            {
                var u   = new Uri(uri, url);
                var key = u.ToString();

                if (pages.ContainsKey(key))
                {
                    url = reader.ReadLine();
                    continue;
                }

                var request = new Request(u);
                request.Method = method;

                if (!string.IsNullOrEmpty(ip))
                {
                    request.Ip = ip;
                }

                var response = Crawler.Request(request);
                var content  = response.Data.ToString();

                var blocks = Feeder.GetExtractBlock(key);
                var er     = new ExtractRequest
                {
                    Blocks  = blocks,
                    Content = content
                };

                var results = Extractor.Extract(er);

                // Prefer the result carrying the most metas; there may be none at all.
                var r = results.OrderByDescending(m => m.Metas.Count).FirstOrDefault();
                if (r == null || r.Paging == null)
                {
                    // "<= 0" also ends the walk when a non-positive budget was passed in,
                    // which "== 0" would never reach.
                    if (--maxRetry <= 0)
                    {
                        break;
                    }

                    // Wait before requesting the same url again (url is intentionally not advanced).
                    Thread.Sleep(3000);
                    continue;
                }

                // Report the page that was just downloaded and remember it as visited.
                handler(u, r);
                pages.Add(key, r);

                if (r.Paging.Count > 0)
                {
                    // Merge this page's links with the first page's links and restart reading
                    // from the top; links already visited are skipped by the check above.
                    var nlines = String.Join("\n", r.Paging.Distinct());
                    var diff   = diffBuilder.BuildDiffModel(lines, nlines);

                    nlines = string.Join("\n", diff.Lines.Select(m => m.Text));
                    reader = new StringReader(nlines);
                }

                // Always advance, otherwise a page without further paging links would be
                // requested again forever.
                url = reader.ReadLine();
            }
        }