Example #1
0
        /// <summary>
        /// Extracts the link targets of the anchors found in an HTML document.
        /// </summary>
        /// <param name="nodeContent">
        /// Downloaded document; its <c>ContentType</c>, <c>Content</c> and <c>Node.Uri</c> are read.
        /// </param>
        /// <returns>
        /// One <c>Link</c> per anchor whose double-quoted <c>href</c> could be turned into an absolute URI.
        /// The list is empty when the content type is not <c>text/html</c>.
        /// </returns>
        /// <remarks>
        /// Fragments (<c>#...</c>) are stripped. Relative references are resolved against the URI of the
        /// node the content belongs to. Exceptions raised while scanning are reported to
        /// <c>myCrawlListener</c> instead of being propagated; links collected so far are still returned.
        /// </remarks>
        public List <Link> EvalLinks(NodeContent nodeContent)
        {
            const string hrefMarker = "href=\"";

            List <Link> results = new List <Link>();

            // Media types are case-insensitive, and a missing content type means "not HTML".
            string contentType = nodeContent.ContentType;
            if (contentType == null || !contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase))
            {
                return(results);
            }

            try
            {
                // The base URI is parsed on first use only, so a document that contains nothing but
                // absolute links never touches nodeContent.Node.
                Uri  baseUri       = null;
                bool baseUriParsed = false;

                foreach (Match anchor in anchorsRegex.Matches(nodeContent.Content))
                {
                    string url = anchor.Value;

                    // TODO: this should be done not only with double quotes
                    int hrefStart = url.IndexOf(hrefMarker, StringComparison.Ordinal);
                    if (hrefStart < 0)
                    {
                        continue;
                    }
                    url = url.Substring(hrefStart + hrefMarker.Length);

                    // Cut at the closing quote of the attribute value.
                    int quoteIndex = url.IndexOf('"');
                    if (quoteIndex >= 0)
                    {
                        url = url.Substring(0, quoteIndex);
                    }

                    // Drop the fragment: it addresses a position inside the same document.
                    int fragmentIndex = url.IndexOf('#');
                    if (fragmentIndex >= 0)
                    {
                        url = url.Substring(0, fragmentIndex);
                    }

                    // A leading '/' marks a rooted ("/path") or scheme-relative ("//host/path") reference.
                    // System.Uri may interpret such a string as a local file or UNC path when asked for an
                    // absolute URI, so these are always resolved against the base instead.
                    bool startsWithSlash = url.StartsWith("/", StringComparison.Ordinal);

                    Uri uri = null;
                    if (startsWithSlash || !Uri.TryCreate(url, UriKind.Absolute, out uri))
                    {
                        if (!baseUriParsed)
                        {
                            Uri.TryCreate(nodeContent.Node.Uri.ToString(), UriKind.Absolute, out baseUri);
                            baseUriParsed = true;
                        }

                        // Standard reference resolution: "c.html" on "http://host/a/b.html" yields
                        // "http://host/a/c.html", and "/x" yields "http://host/x".
                        if (baseUri == null || !Uri.TryCreate(baseUri, url, out uri))
                        {
                            continue;
                        }
                    }

                    if (uri != null)
                    {
                        results.Add(new Link(uri));
                    }
                }
            }
            catch (Exception ex)
            {
                myCrawlListener.ExceptionRaised(this, ex);
            }
            return(results);
        }
Example #2
0
        /// <summary>
        /// Downloads the resource at <paramref name="uri"/> and returns its body when it is textual.
        /// </summary>
        /// <param name="uri">Address to request.</param>
        /// <param name="status">
        /// Status code of the response, or of the error response carried by a <see cref="WebException"/>;
        /// <see cref="HttpStatusCode.NoContent"/> when no response was obtained.
        /// </param>
        /// <param name="contentType">Content type reported by the response; empty when unavailable.</param>
        /// <param name="contentLength">Content length reported by the response; 0 when unavailable.</param>
        /// <returns>
        /// The response body when the status is <see cref="HttpStatusCode.OK"/> and the content type starts
        /// with "text"; otherwise an empty string.
        /// </returns>
        /// <remarks>
        /// Exceptions raised while requesting or reading are reported to <c>myCrawlListener</c> and not
        /// propagated. Exceptions raised while creating the request are not caught here.
        /// </remarks>
        private string ReadUri(Uri uri, out HttpStatusCode status, out string contentType, out long contentLength)
        {
            string s = string.Empty;

            status        = HttpStatusCode.NoContent;
            contentType   = string.Empty;
            contentLength = 0;

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);

            // User-Agent is a restricted header: it has to be set through the dedicated property,
            // adding it to the Headers collection is rejected on .NET Framework.
            request.UserAgent = userAgent;

            try
            {
                // Dispose the response on every path; otherwise a non-OK or non-text response, whose
                // stream is never opened, would keep its connection allocated.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    status        = response.StatusCode;
                    contentType   = response.ContentType ?? string.Empty;
                    contentLength = response.ContentLength;

                    if (status == HttpStatusCode.OK &&
                        contentType.StartsWith("text", StringComparison.OrdinalIgnoreCase))
                    {
                        using (Stream data = response.GetResponseStream())
                        using (StreamReader reader = new StreamReader(data))
                        {
                            s = reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (WebException ex)
            {
                // Protocol errors (4xx/5xx) still carry a response from which the status can be taken.
                // It is intentionally not disposed here because the listener receives the exception
                // and may still inspect it.
                HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
                if (errorResponse != null)
                {
                    status = errorResponse.StatusCode;
                }
                myCrawlListener.ExceptionRaised(this, ex);
            }
            catch (Exception ex)
            {
                myCrawlListener.ExceptionRaised(this, ex);
            }
            return(s);
        }