示例#1
0
        /// <summary>
        /// Stores the body of a successfully downloaded text response in
        /// <c>propertyBag.Text</c>, with leading and trailing whitespace removed.
        /// </summary>
        /// <param name="crawler">The crawler driving this step; not used by this method.</param>
        /// <param name="propertyBag">
        /// The download result. Its status code and content type decide whether
        /// anything happens; its <c>Text</c> property receives the extracted text.
        /// </param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            // Only a 200 response with a text content type is processed; the
            // content type is inspected only once the status check has passed.
            bool downloadSucceeded = propertyBag.StatusCode == HttpStatusCode.OK;
            if (!downloadSucceeded || !IsTextContent(propertyBag.ContentType))
            {
                return;
            }

            using (MemoryStream responseStream = propertyBag.GetResponseStream())
            {
                // ReadToEnd is not a MemoryStream member; presumably a project
                // extension method that returns the stream content as a string.
                propertyBag.Text = responseStream.ReadToEnd().Trim();
            }
        }
示例#2
0
        /// <summary>
        /// Reads the downloaded response as text and assigns the trimmed result to
        /// <c>propertyBag.Text</c>. Responses that are not HTTP 200, or whose content
        /// type is not accepted by <c>IsTextContent</c>, are left untouched.
        /// </summary>
        /// <param name="crawler">The crawler driving this step; not used by this method.</param>
        /// <param name="propertyBag">The download result to read from and write the text to.</param>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            if (propertyBag.StatusCode == HttpStatusCode.OK && IsTextContent(propertyBag.ContentType))
            {
                using (MemoryStream body = propertyBag.GetResponseStream())
                {
                    // ReadToEnd is presumably a project-provided stream extension.
                    string rawText = body.ReadToEnd();
                    propertyBag.Text = rawText.Trim();
                }
            }
        }
        /// <summary>
        /// Detects the language of <c>propertyBag.Text</c> by sending (a prefix of) the
        /// text to the Google AJAX language-detection endpoint, then stores the result
        /// in the property bag: <c>"Language"</c> receives the reported language code
        /// and <c>"LanguageCulture"</c> the corresponding <see cref="CultureInfo"/>.
        /// </summary>
        /// <param name="crawler">The crawler driving this step; must not be null.</param>
        /// <param name="propertyBag">
        /// The download result whose <c>Text</c> is analysed; must not be null.
        /// </param>
        /// <remarks>
        /// Nothing is stored when there is no text to analyse, when the service
        /// returns an empty body, or when the response carries no language.
        /// <see cref="CultureInfo.GetCultureInfo(string)"/> still throws for a language
        /// code the runtime does not know, exactly as before.
        /// </remarks>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            string content = propertyBag.Text;

            if (content.IsNullOrEmpty())
            {
                return;
            }

            // Only the first MaxPostSize characters are sent to the service.
            string contentLookupText = content.Length > MaxPostSize
                                ? content.Substring(0, MaxPostSize).Trim()
                                : content.Trim();

            if (contentLookupText.IsNullOrEmpty())
            {
                // Whitespace-only text: nothing to detect, so skip the request.
                return;
            }

            // The text must be percent-encoded before it goes into the query string,
            // otherwise characters such as '&', '#', '+' or '%' corrupt or truncate
            // the q parameter.
            // NOTE(review): older .NET Framework versions reject very long inputs to
            // Uri.EscapeDataString - confirm MaxPostSize stays well below that limit.
            string encodedRequestUrlFragment =
                "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q={0}".FormatWith(
                    Uri.EscapeDataString(contentLookupText));

            IWebDownloader downloader = m_DownloaderFactory.GetDownloader();
            PropertyBag    result     = downloader.Download(new CrawlStep(new Uri(encodedRequestUrlFragment), 0), DownloadMethod.Get);

            using (MemoryStream responseReader = result.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseReader))
            {
                // Only the first line of the response is parsed as JSON.
                string json = reader.ReadLine();
                if (json.IsNullOrEmpty())
                {
                    // Empty response body: ReadLine returns null at end of stream,
                    // which would make Encoding.GetBytes throw.
                    return;
                }

                using (MemoryStream ms = new MemoryStream(Encoding.Unicode.GetBytes(json)))
                {
                    DataContractJsonSerializer ser =
                        new DataContractJsonSerializer(typeof(LanguageDetector));
                    LanguageDetector detector = ser.ReadObject(ms) as LanguageDetector;

                    // A reply without responseData or without a language carries no
                    // usable result (presumably how the service reports an error -
                    // TODO confirm).
                    if (detector.IsNull() ||
                        detector.responseData.IsNull() ||
                        detector.responseData.language.IsNull())
                    {
                        return;
                    }

                    CultureInfo culture = CultureInfo.GetCultureInfo(detector.responseData.language);
                    propertyBag["Language"].Value        = detector.responseData.language;
                    propertyBag["LanguageCulture"].Value = culture;
                }
            }
        }
示例#4
0
        /// <summary>
        /// Parses a successfully downloaded HTML response and fills the property bag
        /// with its title, meta data and extracted text, then queues every link found
        /// in the document as a new crawl step one level deeper.
        /// </summary>
        /// <param name="crawler">
        /// The crawler that receives the discovered links via <c>AddStep</c>; must not be null.
        /// </param>
        /// <param name="propertyBag">
        /// The download result to parse. Receives <c>Title</c>, <c>Text</c> and the
        /// <c>"Meta"</c> property; must not be null.
        /// </param>
        /// <remarks>
        /// Returns without doing anything when the status code is not 200 or the content
        /// type is not accepted by <c>IsHtmlContent</c>. The document may be reloaded
        /// twice along the way (once for text extraction, once for link extraction), so
        /// the order of the steps below matters.
        /// </remarks>
        public void Process(Crawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return;
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return;
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (MemoryStream reader = propertyBag.GetResponseStream())
            {
                // Encoding detection reads from the stream, so rewind before the real load.
                Encoding documentEncoding = htmlDoc.DetectEncoding(reader);
                reader.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(reader, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(reader, true);
                }
            }

            // Keep the markup as first parsed; link extraction below starts again
            // from this copy rather than from the text-stripped document.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;

            // With text-strip or substitution rules configured, title, meta data and
            // text are extracted from a stripped and substituted copy of the markup.
            if (HasTextStripRules || HasSubstitutionRules)
            {
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title: the inner text of every <title> node, joined with ';'.
            // (nodes is null-checked - presumably SelectNodes yields null when nothing matches.)
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data: one "name: content" string per <meta> element that
            // has both attributes with non-empty values.
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                               let content = entry.Attributes["content"]
                                             where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                                             select name.Value + ": " + content.Value).ToArray();
            }

            // Extract Text from the (possibly stripped/substituted) document.
            propertyBag.Text = htmlDoc.ExtractText().Trim();

            // Links are taken from the original markup, not the text-stripped one:
            // when either rule set is configured, reload from originalContent with
            // only the link-strip step applied.
            if (HasLinkStripRules || HasTextStripRules)
            {
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            // Extract Links
            DocumentWithLinks links = htmlDoc.GetLinks();

            // Links and References are merged with Union, which also drops duplicates.
            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                // Resolve the link against the response URL without its query string or fragment.
                string baseUrl        = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                string decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    // Link could not be normalized; skip it.
                    continue;
                }

                // Queue the link one level deeper than the current step, recording the
                // link as written in the page and the URL of the page it was found on.
                crawler.AddStep(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
                                propertyBag.Step, new Dictionary <string, object>
                {
                    { Resources.PropertyBagKeyOriginalUrl, link },
                    { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                });
            }
        }
示例#5
0
        /// <summary>
        /// Downloads and parses the start page's <c>robots.txt</c> and stores the
        /// upper-cased <c>Disallow</c> entries that apply to this crawler in
        /// <c>m_DenyUrls</c>.
        /// </summary>
        /// <remarks>
        /// A <c>User-Agent</c> line applies when it contains <c>*</c> or the
        /// downloader's user agent; the <c>Disallow</c> lines that follow are collected
        /// until a non-matching <c>User-Agent</c> line is seen. <c>Allow</c> lines and
        /// comments are ignored.
        /// Loading is best-effort: when the file cannot be fetched (non-200 status,
        /// <see cref="WebException"/> or <see cref="SecurityException"/>),
        /// <c>m_DenyUrls</c> is left unchanged.
        /// </remarks>
        private void Initialize()
        {
            try
            {
                // robots.txt lives at the root of the start page's own origin; resolving
                // against the start URI keeps its scheme and port instead of always
                // falling back to plain http on the default port.
                Uri         robotsUri = new Uri(m_StartPageUri, "/robots.txt");
                PropertyBag robots    = m_WebDownloader.Download(new CrawlStep(robotsUri, 0), DownloadMethod.Get);

                if (robots.StatusCode != HttpStatusCode.OK)
                {
                    return;
                }

                string fileContents;
                using (StreamReader stream = new StreamReader(robots.GetResponseStream(), Encoding.ASCII))
                {
                    fileContents = stream.ReadToEnd();
                }

                // The file's line endings are chosen by the remote server, not by this
                // machine, so split on both CR and LF rather than on Environment.NewLine.
                string[] fileLines = fileContents.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

                bool          rulesApply = false;
                List <string> rules      = new List <string>();
                foreach (string line in fileLines)
                {
                    RobotInstruction ri = new RobotInstruction(line);
                    if (ri.Instruction.IsNullOrEmpty())
                    {
                        continue;
                    }

                    // Dispatch on the first character of the instruction.
                    // NOTE(review): the lower-case labels assume RobotInstruction
                    // normalizes the instruction's casing - confirm.
                    switch (ri.Instruction[0])
                    {
                    case '#':                                 // comment - ignore
                        break;

                    case 'u':                                 // User-Agent
                        // Ordinal matching: these are protocol tokens, not linguistic
                        // text. A missing user agent simply never matches by name.
                        string userAgent = m_WebDownloader.UserAgent;
                        rulesApply = ri.UrlOrAgent.IndexOf("*", StringComparison.Ordinal) >= 0 ||
                                     (userAgent != null &&
                                      ri.UrlOrAgent.IndexOf(userAgent, StringComparison.Ordinal) >= 0);
                        break;

                    case 'd':                                 // Disallow
                        if (rulesApply)
                        {
                            rules.Add(ri.UrlOrAgent.ToUpperInvariant());
                        }
                        break;

                    case 'a':                                 // Allow - not taken into account
                        break;

                    default:
                        // empty/unknown/error
                        break;
                    }
                }

                m_DenyUrls = rules.ToArray();
            }
            catch (WebException)
            {
                // Deliberately ignored: robots.txt could not be fetched, so no deny
                // rules are loaded.
            }
            catch (SecurityException)
            {
                // Deliberately ignored: the request was not permitted, so no deny
                // rules are loaded.
            }
        }