Esempio n. 1
0
        /// <summary>
        /// Sends an HTTP HEAD request to <paramref name="pageAddress"/> and reports
        /// whether the response's Content-Type header starts with "text/html".
        /// </summary>
        /// <param name="pageAddress">Address to probe.</param>
        /// <param name="li">
        /// Link record that receives the status code, the lower-cased content type
        /// and, when the header carries one, the charset parameter.
        /// </param>
        /// <returns>
        /// true when the Content-Type header begins with "text/html"; otherwise false.
        /// </returns>
        /// <remarks>
        /// The status is set to -1 when no HTTP response could be obtained (request
        /// failure without a response, unsupported scheme, or a non-HTTP scheme).
        /// Exceptions other than <see cref="WebException"/> and
        /// <see cref="NotSupportedException"/> are not caught here.
        /// </remarks>
        private bool PageIsHtml(string pageAddress, ref LinkInfo li)
        {
            const string TypeHTML = "text/html";

            HttpWebResponse resp = null;
            bool isHtml = false;

            // Placeholder ("not html") until a real status is known.
            li.StatusCode = (HttpStatusCode)(-2);

            try
            {
                // WebRequest.Create returns a non-HTTP request object for schemes such
                // as ftp: or file:. A hard cast would throw InvalidCastException, so
                // treat those addresses the same way as an unsupported scheme.
                HttpWebRequest req = WebRequest.Create(pageAddress) as HttpWebRequest;
                if (null == req)
                {
                    li.StatusCode = (HttpStatusCode)(-1);
                    return false;
                }

                if (this.noProxy)
                {
                    req.Proxy = new WebProxy();
                }

                req.Method = "HEAD";

                resp = (HttpWebResponse)req.GetResponse();

                string contentType = resp.Headers["Content-type"];
                if (contentType != null)
                {
                    contentType = contentType.ToLower(CultureInfo.InvariantCulture);
                    isHtml = contentType.StartsWith(TypeHTML, StringComparison.Ordinal);

                    if (contentType.IndexOf(';') >= 0)
                    {
                        // Capture only the charset token: skip optional quotes and stop
                        // at the next parameter separator or whitespace.
                        Match charsetMatch = Regex.Match(contentType,
                                                         @"charset\s*=\s*""?([^"";\s]+)",
                                                         RegexOptions.IgnoreCase);
                        if (charsetMatch.Success)
                        {
                            li.Charset = charsetMatch.Groups[1].Value;
                        }
                    }

                    // Content type reported for this address.
                    li.ContentType = contentType;
                }

                li.StatusCode = resp.StatusCode;
            }
            catch (WebException e)
            {
                string str = string.Format(CultureInfo.CurrentCulture,
                                        "Caught WebException: {0}",
                                        e.Status.ToString());

                // Error responses (4xx/5xx) still carry a response object; take the
                // status from it and let the finally block close it.
                resp = (HttpWebResponse)e.Response;
                if (null != resp)
                {
                    li.StatusCode = resp.StatusCode;
                    str = string.Format(CultureInfo.CurrentCulture,
                                        "{0} ({1})",
                                        str, li.StatusCode);
                }
                else
                {
                    li.StatusCode = (HttpStatusCode)(-1);
                }

                if (CurrentPageEvent != null)
                {
                    CurrentPageEvent(this, new CurrentPageEventArgs(str));
                }
            }
            catch (NotSupportedException)
            {
                li.StatusCode = (HttpStatusCode)(-1);
            }
            finally
            {
                if (null != resp)
                {
                    resp.Close();
                }
            }

            return isHtml;
        }
Esempio n. 2
0
 /// <summary>
 /// Creates event arguments that carry the given link record.
 /// </summary>
 /// <param name="linkInfo">Link record stored on this instance.</param>
 public CurrentPageEventArgs(LinkInfo linkInfo)
 {
     this.linkInfo = linkInfo;
 }
Esempio n. 3
0
        /// <summary>
        /// Downloads the page at <paramref name="pageUri"/> and returns the HTTP
        /// status of the response.
        /// </summary>
        /// <param name="pageUri">
        /// Address to fetch; on success it is replaced with the URI reported by the
        /// response.
        /// </param>
        /// <param name="pageData">
        /// Receives the response body as text, or an empty string when nothing was read.
        /// </param>
        /// <param name="linkInfo">
        /// Link record that is updated with the status and body and passed to
        /// CurrentPageEvent subscribers (only when the event has subscribers).
        /// </param>
        /// <returns>
        /// The response status code, or -1 when no HTTP response could be obtained
        /// (request failure without a response, unsupported or non-HTTP scheme).
        /// </returns>
        private HttpStatusCode GetPageData(ref Uri pageUri,
            out string pageData,
            LinkInfo linkInfo)
        {
            HttpStatusCode status = (HttpStatusCode)0;
            HttpWebResponse resp = null;

            pageData = "";

            try
            {
                // WebRequest.Create returns a non-HTTP request object for schemes such
                // as ftp: or file:. A hard cast would throw InvalidCastException.
                HttpWebRequest req = WebRequest.Create(pageUri) as HttpWebRequest;
                if (null == req)
                {
                    return (HttpStatusCode)(-1);
                }

                if (this.noProxy)
                {
                    req.Proxy = new WebProxy();
                }

                resp = (HttpWebResponse)req.GetResponse();
                pageUri = resp.ResponseUri;

                // using guarantees the reader is disposed even if ReadToEnd throws.
                using (StreamReader sr = new StreamReader(resp.GetResponseStream()))
                {
                    pageData = sr.ReadToEnd();
                }

                status = resp.StatusCode;

                if (CurrentPageEvent != null)
                {
                    linkInfo.StatusCode = status;
                    linkInfo.Html = pageData;
                    CurrentPageEvent(this, new CurrentPageEventArgs(linkInfo));
                }
            }
            catch (WebException e)
            {
                string str = string.Format(CultureInfo.CurrentCulture,
                                        "Caught WebException: {0}",
                                        e.Status.ToString());

                // Error responses (4xx/5xx) still carry a response object; take the
                // status from it and let the finally block close it.
                resp = (HttpWebResponse)e.Response;
                if (null != resp)
                {
                    status = resp.StatusCode;
                    str = string.Format(CultureInfo.CurrentCulture,
                                        "{0} ({1})",
                                        str, status);
                }
                else
                {
                    status = (HttpStatusCode)(-1);
                }

                if (CurrentPageEvent != null)
                {
                    CurrentPageEvent(this, new CurrentPageEventArgs(str));
                }
            }
            catch (NotSupportedException)
            {
                // Same handling as the HEAD probe in PageIsHtml: report "no response".
                status = (HttpStatusCode)(-1);
            }
            finally
            {
                if (null != resp)
                {
                    resp.Close();
                }
            }

            return status;
        }
Esempio n. 4
0
        /// <summary>
        /// Sends an HTTP HEAD request to <paramref name="pageAddress"/> and reports
        /// whether the response's Content-Type header starts with "text/html".
        /// </summary>
        /// <param name="pageAddress">Address to probe.</param>
        /// <param name="li">
        /// Link record that receives the status code, the lower-cased content type
        /// and, when the header carries one, the charset parameter.
        /// </param>
        /// <returns>
        /// true when the Content-Type header begins with "text/html"; otherwise false.
        /// </returns>
        /// <remarks>
        /// The status is set to -1 when no HTTP response could be obtained (request
        /// failure without a response, unsupported scheme, or a non-HTTP scheme).
        /// Exceptions other than <see cref="WebException"/> and
        /// <see cref="NotSupportedException"/> are not caught here.
        /// </remarks>
        private bool PageIsHtml(string pageAddress, ref LinkInfo li)
        {
            const string    TypeHTML = "text/html";

            HttpWebResponse resp     = null;
            bool            isHtml   = false;

            // Placeholder ("not html") until a real status is known.
            li.StatusCode = (HttpStatusCode)(-2);

            try
            {
                // WebRequest.Create returns a non-HTTP request object for schemes such
                // as ftp: or file:. A hard cast would throw InvalidCastException, so
                // treat those addresses the same way as an unsupported scheme.
                HttpWebRequest req = WebRequest.Create(pageAddress) as HttpWebRequest;
                if (null == req)
                {
                    li.StatusCode = (HttpStatusCode)(-1);
                    return(false);
                }

                if (this.noProxy)
                {
                    req.Proxy = new WebProxy();
                }

                req.Method = "HEAD";

                resp = (HttpWebResponse)req.GetResponse();

                string contentType = resp.Headers["Content-type"];
                if (contentType != null)
                {
                    contentType = contentType.ToLower(CultureInfo.InvariantCulture);
                    isHtml      = contentType.StartsWith(TypeHTML, StringComparison.Ordinal);

                    if (contentType.IndexOf(';') >= 0)
                    {
                        // Capture only the charset token: skip optional quotes and stop
                        // at the next parameter separator or whitespace.
                        Match charsetMatch = Regex.Match(contentType,
                                                         @"charset\s*=\s*""?([^"";\s]+)",
                                                         RegexOptions.IgnoreCase);
                        if (charsetMatch.Success)
                        {
                            li.Charset = charsetMatch.Groups[1].Value;
                        }
                    }

                    // Content type reported for this address.
                    li.ContentType = contentType;
                }

                li.StatusCode = resp.StatusCode;
            }
            catch (WebException e)
            {
                string str = string.Format(CultureInfo.CurrentCulture,
                                           "Caught WebException: {0}",
                                           e.Status.ToString());

                // Error responses (4xx/5xx) still carry a response object; take the
                // status from it and let the finally block close it.
                resp = (HttpWebResponse)e.Response;
                if (null != resp)
                {
                    li.StatusCode = resp.StatusCode;
                    str           = string.Format(CultureInfo.CurrentCulture,
                                                  "{0} ({1})",
                                                  str, li.StatusCode);
                }
                else
                {
                    li.StatusCode = (HttpStatusCode)(-1);
                }

                if (CurrentPageEvent != null)
                {
                    CurrentPageEvent(this, new CurrentPageEventArgs(str));
                }
            }
            catch (NotSupportedException)
            {
                li.StatusCode = (HttpStatusCode)(-1);
            }
            finally
            {
                if (null != resp)
                {
                    resp.Close();
                }
            }

            return(isHtml);
        }
Esempio n. 5
0
 /// <summary>
 /// Initializes the event arguments with the link record to report.
 /// </summary>
 /// <param name="linkInfo">Link record stored in the linkInfo field.</param>
 public CurrentPageEventArgs(LinkInfo linkInfo)
 {
     this.linkInfo = linkInfo;
 }
Esempio n. 6
0
        /// <summary>
        /// Crawls level by level, starting at startingPage: each pass visits every
        /// known link that has not been visited yet, collects the links found on
        /// the HTML pages among them, and adds the new ones for the next pass.
        /// </summary>
        /// <remarks>
        /// The loop ends when stopNow is set, when a pass finds no links, or when
        /// the current depth exceeds maxDepth. Progress and error messages are
        /// reported through CurrentPageEvent; CrawlFinishedEvent is raised once at
        /// the end, including after a failure.
        /// NOTE(review): locking on <c>this</c> is kept because the replacement lock
        /// object would have to be declared outside this method.
        /// </remarks>
        private void Crawl()
        {
            Hashtable links = null;

            try
            {
                links = new Hashtable(StringComparer.InvariantCultureIgnoreCase);

                // Default to http:// when the starting address has no scheme.
                if (-1 == this.startingPage.IndexOf("://", StringComparison.Ordinal))
                {
                    this.startingPage = string.Format(CultureInfo.InvariantCulture,
                                                      "http://{0}",
                                                      this.startingPage);
                }

                int currentDepth = 1;
                links.Add(this.startingPage, new LinkInfo(this.startingPage,
                                                          (HttpStatusCode)0));
                while (!this.stopNow)
                {
                    // Links discovered during this pass.
                    Hashtable found =
                        new Hashtable(StringComparer.InvariantCultureIgnoreCase);

                    #region Visit every known link and extract its links
                    foreach (string page in links.Keys)
                    {
                        // Unless outbound search is enabled, skip addresses that do
                        // not contain the starting page address.
                        if (!this.isOutboundSearch
                            && page.IndexOf(startingPage, StringComparison.Ordinal) < 0)
                        {
                            continue;
                        }

                        // A stop was requested: no further pages are visited.
                        if (this.stopNow)
                        {
                            break;
                        }

                        if (CurrentPageEvent != null)
                        {
                            CurrentPageEvent(this,
                                             new CurrentPageEventArgs(page));
                        }

                        LinkInfo li = (LinkInfo)links[page];
                        try
                        {
                            HttpStatusCode currentStatus = li.StatusCode;

                            // Status 0 marks a link that has not been visited yet.
                            if ((HttpStatusCode)0 == currentStatus)
                            {
                                if (PageIsHtml(page, ref li))
                                {
                                    Uri    pageUri  = new Uri(page);
                                    string pageData = "";
                                    currentStatus = GetPageData(ref pageUri, out pageData, li);

                                    if (HttpStatusCode.OK == currentStatus)
                                    {
                                        // <a href=
                                        GetPageLinks(pageUri, pageData, "a", "href", found);
                                        // <frame src=
                                        GetPageLinks(pageUri, pageData, "frame", "src", found);
                                        // <area href=
                                        GetPageLinks(pageUri, pageData, "area", "href", found);
                                        // <link href=
                                        GetPageLinks(pageUri, pageData, "link", "href", found);
                                    }
                                }
                                else
                                {
                                    // Keep the status recorded by the HEAD probe.
                                    // Writing the stale 0 back would discard it and
                                    // cause the link to be probed again on every pass.
                                    currentStatus = li.StatusCode;
                                }
                            }

                            li.StatusCode = currentStatus;
                        }
                        catch (UriFormatException)
                        {
                            li.StatusCode = (HttpStatusCode)(-1);

                            if (CurrentPageEvent != null)
                            {
                                String message = String.Format(
                                    CultureInfo.CurrentCulture,
                                    "Unable to crawl {0} (UriFormatException)",
                                    page);
                                CurrentPageEvent(this,
                                                 new CurrentPageEventArgs(message));
                            }
                        }
                    }  // foreach
                    #endregion

                    // No links were found in this pass: stop.
                    if (0 == found.Count)
                    {
                        lock (this)
                        {
                            this.stopNow = true;
                        }
                        continue;
                    }

                    // Register the links that are new, tagged with the current depth.
                    foreach (string page in found.Keys)
                    {
                        if (!links.ContainsKey(page))
                        {
                            LinkInfo lk = (LinkInfo)found[page];
                            lk.DirDepth = currentDepth;
                            links.Add(page, lk);
                        }
                    }

                    // Stop once the current depth exceeds maxDepth; otherwise go one
                    // level deeper.
                    if (this.maxDepth < currentDepth)
                    {
                        lock (this)
                        {
                            this.stopNow = true;
                        }
                        continue;
                    }
                    currentDepth += 1;
                }
            }
            catch (OutOfMemoryException)
            {
                // Drop the reference to the link table before reporting.
                links = null;

                lock (this)
                {
                    this.stopNow = true;
                }

                if (null != CurrentPageEvent)
                {
                    CurrentPageEvent(this,
                                     new CurrentPageEventArgs(
                                         "Crawl halted: out of memory"));
                }
            }
            catch (Exception e)
            {
                lock (this)
                {
                    this.stopNow = true;
                }

                if (null != CurrentPageEvent)
                {
                    string message = string.Format(CultureInfo.CurrentCulture,
                                                   "Crawl halted: {0} - {1}",
                                                   e.ToString(),
                                                   e.Message);
                    CurrentPageEvent(this,
                                     new CurrentPageEventArgs(message));
                }
            }

            if (null != CrawlFinishedEvent)
            {
                CrawlFinishedEvent(this, EventArgs.Empty);
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Downloads the page at <paramref name="pageUri"/> and returns the HTTP
        /// status of the response.
        /// </summary>
        /// <param name="pageUri">
        /// Address to fetch; on success it is replaced with the URI reported by the
        /// response.
        /// </param>
        /// <param name="pageData">
        /// Receives the response body as text, or an empty string when nothing was read.
        /// </param>
        /// <param name="linkInfo">
        /// Link record that is updated with the status and body and passed to
        /// CurrentPageEvent subscribers (only when the event has subscribers).
        /// </param>
        /// <returns>
        /// The response status code, or -1 when no HTTP response could be obtained
        /// (request failure without a response, unsupported or non-HTTP scheme).
        /// </returns>
        private HttpStatusCode GetPageData(ref Uri pageUri,
                                           out string pageData,
                                           LinkInfo linkInfo)
        {
            HttpStatusCode  status = (HttpStatusCode)0;
            HttpWebResponse resp   = null;

            pageData = "";

            try
            {
                // WebRequest.Create returns a non-HTTP request object for schemes such
                // as ftp: or file:. A hard cast would throw InvalidCastException.
                HttpWebRequest req = WebRequest.Create(pageUri) as HttpWebRequest;
                if (null == req)
                {
                    return((HttpStatusCode)(-1));
                }

                if (this.noProxy)
                {
                    req.Proxy = new WebProxy();
                }

                resp    = (HttpWebResponse)req.GetResponse();
                pageUri = resp.ResponseUri;

                // using guarantees the reader is disposed even if ReadToEnd throws.
                using (StreamReader sr = new StreamReader(resp.GetResponseStream()))
                {
                    pageData = sr.ReadToEnd();
                }

                status = resp.StatusCode;

                if (CurrentPageEvent != null)
                {
                    linkInfo.StatusCode = status;
                    linkInfo.Html       = pageData;
                    CurrentPageEvent(this, new CurrentPageEventArgs(linkInfo));
                }
            }
            catch (WebException e)
            {
                string str = string.Format(CultureInfo.CurrentCulture,
                                           "Caught WebException: {0}",
                                           e.Status.ToString());

                // Error responses (4xx/5xx) still carry a response object; take the
                // status from it and let the finally block close it.
                resp = (HttpWebResponse)e.Response;
                if (null != resp)
                {
                    status = resp.StatusCode;
                    str    = string.Format(CultureInfo.CurrentCulture,
                                           "{0} ({1})",
                                           str, status);
                }
                else
                {
                    status = (HttpStatusCode)(-1);
                }

                if (CurrentPageEvent != null)
                {
                    CurrentPageEvent(this, new CurrentPageEventArgs(str));
                }
            }
            catch (NotSupportedException)
            {
                // Same handling as the HEAD probe in PageIsHtml: report "no response".
                status = (HttpStatusCode)(-1);
            }
            finally
            {
                if (null != resp)
                {
                    resp.Close();
                }
            }

            return(status);
        }