Example #1
0
        /// <summary>
        /// Seeds the URL queue from the current settings and then blocks the calling
        /// thread until the queue reports that it no longer has a value.
        /// </summary>
        /// <remarks>
        /// Each pass waits on <c>Workers.WorkersFreeEvent</c> before re-checking the queue,
        /// so the queue is only inspected after that event has been signalled.
        /// NOTE(review): presumably the event is set when a worker finishes - confirm
        /// against the <c>Workers</c> implementation.
        /// </remarks>
        public void Crawl()
        {
            SpiderProcess.Seed2Queue(urlQueue, Setting);

            // Always wait at least once, then keep waiting for as long as work remains queued.
            do
            {
                Workers.WorkersFreeEvent.WaitOne();
            }
            while (urlQueue.HasValue);
        }
Example #2
0
        /// <summary>
        /// Processes a single queue entry: reads the page behind its URL, raises the
        /// data-received event, and raises the add-URL event for every regex match in
        /// the page that is not filtered out and that satisfies the match check.
        /// </summary>
        /// <remarks>
        /// Does nothing when the queue yields <c>null</c> or when the entry's depth is
        /// not below <c>Setting.Depth</c>. Exceptions raised while reading or scanning
        /// the page are forwarded to <c>OnErrorEventHandler</c> if <c>ErrorEventHandler</c>
        /// is set; otherwise they are discarded.
        /// </remarks>
        public void WorkerAction()
        {
            var entry = urlQueue.DeQueue();

            // Guard: nothing dequeued, or the entry is not below the configured depth.
            if (entry == null || !(entry.Depth < Setting.Depth))
            {
                return;
            }

            try
            {
                var html = SpiderProcess.UrlRead(entry.Url, Setting);
                OnDataReceivedEventHandler(this, new Event.DataReceivedEventArgs(entry.Url));

                var linkMatches = new Regex(pattern, RegexOptions.IgnoreCase).Matches(html);

                foreach (var candidate in linkMatches)
                {
                    var link = candidate.ToString();

                    // A filtered URL is skipped outright; the rest must also pass the match check.
                    if (!SpiderProcess.UrlFilter(link, Setting) && SpiderProcess.UrlMatch(link, Setting))
                    {
                        OnAddUrlEventHandler(this, new Event.AddUrlEventArgs(link, entry.Depth));
                    }
                }
            }
            catch (Exception ex)
            {
                // Only report when an error handler is present; otherwise the failure is dropped.
                if (ErrorEventHandler != null)
                {
                    OnErrorEventHandler(this, new Event.ErrorEventArgs(entry.Url, ex.Message));
                }
            }
        }
Example #3
0
        /// <summary>
        /// Processes a single queue entry: downloads the raw bytes behind its URL,
        /// raises the data-received event with those bytes, decodes them as UTF-8, and
        /// raises the add-URL event for every regex match in the text that is not
        /// filtered out and that satisfies the match check.
        /// </summary>
        /// <remarks>
        /// Does nothing when the queue yields <c>null</c> or when the entry's depth is
        /// not below <c>Setting.Depth</c>. Exceptions raised while downloading or
        /// scanning are forwarded to <c>OnErrorEventHandler</c> if
        /// <c>ErrorEventHandler</c> is set; otherwise they are discarded.
        /// NOTE(review): the payload is always decoded as UTF-8 regardless of the
        /// encoding the server declares - confirm that this is intended.
        /// </remarks>
        public void WorkerAction()
        {
            var result = urlQueue.DeQueue();

            if (result != null)
            {
                if (result.Depth < Setting.Depth)
                {
                    try
                    {
                        string pageData = null;
                        using (var dataStream = SpiderProcess.UrlReadData(result.Url, Setting))
                        using (var received = new System.IO.MemoryStream())
                        {
                            const int buffSize = 40960;

                            var buff = new byte[buffSize];
                            int bytes;

                            // Read returns 0 once the source is exhausted. Only the bytes
                            // actually read in each pass are appended, so short reads need
                            // no temporary copy.
                            while ((bytes = dataStream.Read(buff, 0, buffSize)) != 0)
                            {
                                received.Write(buff, 0, bytes);
                            }

                            var data = received.ToArray();

                            // Subscribers get the raw bytes before any text decoding happens.
                            OnDataReceivedEventHandler(this, new Event.DataReceivedEventArgs(result.Url, data));
                            pageData = Encoding.UTF8.GetString(data);
                        }

                        Regex           r = new Regex(pattern, RegexOptions.IgnoreCase);
                        MatchCollection m = r.Matches(pageData);

                        foreach (var url in m)
                        {
                            var U = url.ToString();

                            // Filtered URLs are skipped before the match check runs.
                            if (SpiderProcess.UrlFilter(U, Setting))
                            {
                                continue;
                            }
                            if (SpiderProcess.UrlMatch(U, Setting))
                            {
                                OnAddUrlEventHandler(this, new Event.AddUrlEventArgs(U, result.Depth));
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        // Only report when an error handler is present; otherwise the failure is dropped.
                        if (ErrorEventHandler != null)
                        {
                            OnErrorEventHandler(this, new Event.ErrorEventArgs(result.Url, e.Message));
                        }
                    }
                }
            }
        }