/// <summary>
/// Stores the trimmed response body in <c>propertyBag.Text</c> when the response
/// has status <see cref="HttpStatusCode.OK"/> and a content type accepted by
/// <c>IsTextContent</c>. Any other response is left untouched.
/// </summary>
/// <param name="crawler">The running crawler; not used by this step.</param>
/// <param name="propertyBag">The download result to read from and write the text to.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    bool isSuccessful = propertyBag.StatusCode == HttpStatusCode.OK;

    // Short-circuits: the content type is only inspected for successful responses.
    if (!isSuccessful || !IsTextContent(propertyBag.ContentType))
    {
        return;
    }

    using (MemoryStream responseStream = propertyBag.GetResponseStream())
    {
        string body = responseStream.ReadToEnd();
        propertyBag.Text = body.Trim();
    }
}
/// <summary>
/// Sends the start of <c>propertyBag.Text</c> (at most <c>MaxPostSize</c> characters)
/// to the Google AJAX language detection endpoint and stores the detected language
/// under the <c>"Language"</c> and <c>"LanguageCulture"</c> property bag keys.
/// </summary>
/// <remarks>
/// The step does nothing when there is no text to look up, when the service returns
/// an empty first line, or when the response carries no language.
/// </remarks>
/// <param name="crawler">The running crawler; must not be null.</param>
/// <param name="propertyBag">The download result whose text is analysed; must not be null.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    string content = propertyBag.Text;
    if (content.IsNullOrEmpty())
    {
        return;
    }

    string contentLookupText = content.Length > MaxPostSize
        ? content.Substring(0, MaxPostSize).Trim()
        : content.Trim();

    // Whitespace-only text trims down to nothing; there is nothing to detect.
    if (contentLookupText.IsNullOrEmpty())
    {
        return;
    }

    // The text is arbitrary page content, so it must be escaped before it is placed
    // in the query string; otherwise '&', '#', '+' or '%' would corrupt the request.
    // NOTE(review): assumes MaxPostSize stays within Uri.EscapeDataString's input
    // length limit on the target framework - confirm.
    string encodedRequestUrlFragment =
        "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q={0}".FormatWith(
            Uri.EscapeDataString(contentLookupText));

    IWebDownloader downloader = m_DownloaderFactory.GetDownloader();
    PropertyBag result = downloader.Download(new CrawlStep(new Uri(encodedRequestUrlFragment), 0), DownloadMethod.Get);

    using (MemoryStream responseReader = result.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseReader))
    {
        // ReadLine returns null on an empty response; GetBytes(null) would throw.
        string json = reader.ReadLine();
        if (json.IsNullOrEmpty())
        {
            return;
        }

        using (MemoryStream ms = new MemoryStream(Encoding.Unicode.GetBytes(json)))
        {
            DataContractJsonSerializer ser = new DataContractJsonSerializer(typeof(LanguageDetector));
            LanguageDetector detector = ser.ReadObject(ms) as LanguageDetector;

            // A response without responseData or without a language carries no result.
            if (detector.IsNull() ||
                detector.responseData.IsNull() ||
                detector.responseData.language.IsNull())
            {
                return;
            }

            string language = detector.responseData.language;
            CultureInfo culture = CultureInfo.GetCultureInfo(language);
            propertyBag["Language"].Value = language;
            propertyBag["LanguageCulture"].Value = culture;
        }
    }
}
/// <summary>
/// Parses a successful HTML response, stores its title, meta data and extracted
/// text on the property bag, and queues every link found in the document as a
/// new crawl step one level deeper than the current step.
/// </summary>
/// <remarks>
/// Text strip and substitution rules are applied before title, meta data and text
/// are extracted. Link strip rules are applied to the originally loaded markup
/// before links are extracted.
/// </remarks>
/// <param name="crawler">The crawler that receives the discovered links; must not be null.</param>
/// <param name="propertyBag">The download result to parse and enrich; must not be null.</param>
public void Process(Crawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    bool isSuccessful = propertyBag.StatusCode == HttpStatusCode.OK;
    if (!isSuccessful || !IsHtmlContent(propertyBag.ContentType))
    {
        return;
    }

    HtmlDocument document = new HtmlDocument
        {
            OptionAddDebuggingAttributes = false,
            OptionAutoCloseOnEnd = true,
            OptionFixNestedTags = true,
            OptionReadEncoding = true
        };

    using (MemoryStream responseStream = propertyBag.GetResponseStream())
    {
        Encoding detectedEncoding = document.DetectEncoding(responseStream);

        // Detection consumed the stream; rewind before the real load.
        responseStream.Seek(0, SeekOrigin.Begin);

        if (detectedEncoding.IsNull())
        {
            document.Load(responseStream, true);
        }
        else
        {
            document.Load(responseStream, detectedEncoding, true);
        }
    }

    // Keep the markup as first loaded; link extraction below restarts from it.
    string rawHtml = document.DocumentNode.OuterHtml;

    if (HasTextStripRules || HasSubstitutionRules)
    {
        string rewrittenHtml = Substitute(StripText(rawHtml), propertyBag.Step);
        using (TextReader rewrittenReader = new StringReader(rewrittenHtml))
        {
            document.Load(rewrittenReader);
        }
    }

    // Title: all <title> elements joined with ';'.
    HtmlNodeCollection titleNodes = document.DocumentNode.SelectNodes("//title");
    if (!titleNodes.IsNull())
    {
        string[] titles = titleNodes.Select(node => node.InnerText).ToArray();
        propertyBag.Title = string.Join(";", titles).Trim();
    }

    // Meta data: "name: content" for every <meta> with both attributes non-empty.
    HtmlNodeCollection metaNodes = document.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!metaNodes.IsNull())
    {
        propertyBag["Meta"].Value = metaNodes
            .Select(meta => new
                {
                    NameAttribute = meta.Attributes["name"],
                    ContentAttribute = meta.Attributes["content"]
                })
            .Where(pair =>
                !pair.NameAttribute.IsNull() && !pair.NameAttribute.Value.IsNullOrEmpty() &&
                !pair.ContentAttribute.IsNull() && !pair.ContentAttribute.Value.IsNullOrEmpty())
            .Select(pair => pair.NameAttribute.Value + ": " + pair.ContentAttribute.Value)
            .ToArray();
    }

    propertyBag.Text = document.ExtractText().Trim();

    if (HasLinkStripRules || HasTextStripRules)
    {
        string linkSourceHtml = StripLinks(rawHtml);
        using (TextReader linkSourceReader = new StringReader(linkSourceHtml))
        {
            document.Load(linkSourceReader);
        }
    }

    // Links: queue every non-empty link that normalizes to a non-empty URL.
    DocumentWithLinks linkInfo = document.GetLinks();
    foreach (string rawLink in linkInfo.Links.Union(linkInfo.References))
    {
        if (!rawLink.IsNullOrEmpty())
        {
            string absoluteLink = NormalizeLink(
                propertyBag.ResponseUri.GetLeftPart(UriPartial.Path),
                ExtendedHtmlUtility.HtmlEntityDecode(rawLink));

            if (!absoluteLink.IsNullOrEmpty())
            {
                crawler.AddStep(new Uri(absoluteLink), propertyBag.Step.Depth + 1,
                    propertyBag.Step, new Dictionary<string, object>
                        {
                            { Resources.PropertyBagKeyOriginalUrl, rawLink },
                            { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                        });
            }
        }
    }
}
/// <summary>
/// Downloads <c>/robots.txt</c> from the start page's site and stores the
/// upper-cased <c>Disallow</c> entries that apply to this crawler in
/// <c>m_DenyUrls</c>.
/// </summary>
/// <remarks>
/// A group of rules applies when its agent line contains <c>*</c> or the
/// downloader's <c>UserAgent</c> string. When the download does not return
/// <see cref="HttpStatusCode.OK"/>, or fails with a <see cref="WebException"/>
/// or <see cref="SecurityException"/>, <c>m_DenyUrls</c> is left unchanged.
/// </remarks>
private void Initialize()
{
    try
    {
        // Resolve against the start page so its scheme and port are kept;
        // robots.txt is scoped to scheme + authority, not just the host name.
        Uri robotsUri = new Uri(m_StartPageUri, "/robots.txt");
        PropertyBag robots = m_WebDownloader.Download(new CrawlStep(robotsUri, 0), DownloadMethod.Get);

        if (robots.StatusCode != HttpStatusCode.OK)
        {
            return;
        }

        string fileContents;
        using (StreamReader stream = new StreamReader(robots.GetResponseStream(), Encoding.ASCII))
        {
            fileContents = stream.ReadToEnd();
        }

        // Split on both CR and LF explicitly: the file comes from a remote server,
        // so its line endings are unrelated to this platform's Environment.NewLine.
        string[] fileLines = fileContents.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);

        bool rulesApply = false;
        List<string> rules = new List<string>();

        foreach (string line in fileLines)
        {
            RobotInstruction ri = new RobotInstruction(line);
            if (ri.Instruction.IsNullOrEmpty())
            {
                continue;
            }

            // NOTE(review): only lower-case instruction letters are matched;
            // presumably RobotInstruction lower-cases them - confirm.
            switch (ri.Instruction[0])
            {
                case '#': // Comment - ignore
                    break;
                case 'u': // User-Agent: decides whether the following rules apply
                    rulesApply =
                        ri.UrlOrAgent.IndexOf("*", StringComparison.Ordinal) >= 0 ||
                        ri.UrlOrAgent.IndexOf(m_WebDownloader.UserAgent, StringComparison.Ordinal) >= 0;
                    break;
                case 'd': // Disallow
                    if (rulesApply)
                    {
                        rules.Add(ri.UrlOrAgent.ToUpperInvariant());
                    }

                    break;
                case 'a': // Allow - not evaluated
                    break;
                default: // Empty/unknown/error
                    break;
            }
        }

        m_DenyUrls = rules.ToArray();
    }
    catch (WebException)
    {
        // Best effort: a robots.txt that cannot be downloaded imposes no rules.
    }
    catch (SecurityException)
    {
        // Best effort: same handling when the download is not permitted.
    }
}