/// <summary>
/// Consumes nodes from <paramref name="reader"/> until an end-tag node whose value equals
/// <paramref name="endTag"/> has been read, or the reader reports EOF.
/// </summary>
/// <param name="reader">Reader to advance; the skipped nodes are discarded.</param>
/// <param name="endTag">Name of the closing tag that stops the skip.</param>
void ReadToEndTag(HtmlReader reader, string endTag)
{
    while (true)
    {
        if (reader.EOF)
        {
            break;
        }

        var token = reader.NextNode();

        // The matching end tag itself is consumed before returning.
        bool reachedEndTag = token.Value == endTag && token.Type == HtmlTokenType.EndTag;
        if (reachedEndTag)
        {
            break;
        }
    }
}
/// <summary>
/// Concatenates the text nodes read from <paramref name="reader"/> until the end tag named
/// <paramref name="endTag"/> is read or the reader reports EOF.
/// </summary>
/// <param name="reader">Reader positioned just after the opening tag of the element.</param>
/// <param name="endTag">
/// Name of the closing tag that terminates the title (callers in this file pass "title", "h1"
/// and "h2").
/// </param>
/// <returns>The collected text passed through SanitiseAttribute with an argument of 128.</returns>
string ReadTitle(HtmlReader reader, string endTag)
{
    var title = new StringBuilder();

    while (!reader.EOF)
    {
        var node = reader.NextNode();

        if (node.Type == HtmlTokenType.Text)
        {
            title.Append(node.Value);
        }
        else if (node.Type == HtmlTokenType.EndTag && node.Value == endTag)
        {
            // Stop at the caller's end tag. Comparing against a fixed "title" here would make
            // h1/h2 reads run on to the end of the document.
            break;
        }
    }

    return SanitiseAttribute(title.ToString(), 128);
}
/// <summary>
/// Buffers the response into memory, fills in the ContentType property via ContentType.Guess
/// when it is unknown or null, and, if the type is still unknown or its media type is "text",
/// scans the content with an HtmlReader and writes the non-blank "src" attribute of every
/// script start tag to the console.
/// </summary>
/// <param name="dataStream">Response body; a null stream is ignored.</param>
/// <returns>A task that completes when the stream has been copied and scanned.</returns>
public override async Task ProcessResponseStream(Stream dataStream)
{
    if (dataStream == null)
    {
        return;
    }

    // The buffer is disposed on every path, including when the content is not scanned.
    using (var ms = new MemoryStream())
    {
        // Copy asynchronously so the async method does not block its caller on stream I/O.
        await dataStream.CopyToAsync(ms);
        ms.Seek(0, SeekOrigin.Begin);

        if (ContentType.IsUnknownOrNull(ContentType))
        {
            ContentType = ContentType.Guess(ms);
        }

        // Rewind again: the guess above may have advanced the buffer position.
        ms.Seek(0, SeekOrigin.Begin);

        if (ContentType.IsUnknownOrNull(ContentType) || ContentType.MediaType == "text")
        {
            using (var p = new HtmlReader(ms))
            {
                while (!p.EOF)
                {
                    var node = p.NextNode();

                    if (node.Type == HtmlTokenType.StartTag && node.Value == "script")
                    {
                        string src = p.GetAttribute("src");
                        if (!string.IsNullOrWhiteSpace(src))
                        {
                            Console.WriteLine(src);
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Advances the reader until a node yields a link, and returns that link.
/// Nodes valued "a" and "link" are read through their "href" attribute, nodes valued "script"
/// through their "src" attribute. A node valued "base" does not produce a result; instead its
/// "href" replaces CurrentSourceUri for the links that follow.
/// </summary>
/// <returns>The next link found, or null when the reader reaches EOF first.</returns>
public Uri NextUri()
{
    Uri uri = null;

    while (!reader.EOF && uri == null)
    {
        var node = reader.NextNode();
        var tag = node.Value;

        if (tag == "a" || tag == "link")
        {
            uri = ReadFirstLinkFromAttribute("href");
        }
        else if (tag == "script")
        {
            uri = ReadFirstLinkFromAttribute("src");
        }
        else if (tag == "base")
        {
            var href = reader.GetAttribute("href");
            if (!string.IsNullOrWhiteSpace(href))
            {
                // sometimes the links are bogus?!
                var baseUri = Utility.GetLinks(null, href).FirstOrDefault();
                if (baseUri != null)
                {
                    CurrentSourceUri = baseUri;
                }
            }
        }
    }

    return uri;
}

/// <summary>
/// Reads the named attribute of the reader's current node and returns the first link that
/// Utility.GetLinks produces for it against CurrentSourceUri.
/// </summary>
/// <param name="attributeName">Attribute holding the link text, e.g. "href" or "src".</param>
/// <returns>The first link, or null when the attribute is blank or yields no links.</returns>
private Uri ReadFirstLinkFromAttribute(string attributeName)
{
    var value = reader.GetAttribute(attributeName);
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    // FirstOrDefault enumerates the result once, where Any() followed by First() did so twice.
    return Utility.GetLinks(CurrentSourceUri, value).FirstOrDefault();
}
/// <summary>
/// Builds a WorkspaceResult from a fetched response: processes the request and response
/// headers, hashes the stream, records its length and, depending on the content type, extracts
/// evaluation text plus a title and meta information from the body.
/// </summary>
/// <param name="stream">Response body; must be seekable.</param>
/// <param name="requestString">Request text handed to ProcessHeaders.</param>
/// <param name="responseHeaders">Response header text handed to ProcessHeaders.</param>
/// <param name="evalText">Receives the contents of evaluationText after the body is read.</param>
/// <returns>The populated result.</returns>
/// <exception cref="FetchoException">Thrown when <paramref name="stream"/> is not seekable.</exception>
public WorkspaceResult Build(Stream stream, string requestString, string responseHeaders, out string evalText)
{
    WorkspaceResult result = new WorkspaceResult();
    result.SourceServerId = FetchoConfiguration.Current.CurrentServerNode.ServerId;

    if (!stream.CanSeek)
    {
        throw new FetchoException("WorkspaceResultBuilder needs a seekable stream");
    }

    ProcessHeaders(result, requestString, responseHeaders);
    result.DataHash = MD5Hash.Compute(stream).ToString();
    result.PageSize = stream.Length;

    // Rewind after hashing so the body can be read from the start.
    stream.Seek(0, SeekOrigin.Begin);

    ContentType contentType = GetContentType(result);

    // NOTE(review): titleness ranks title sources (title, then h1, then h2), but every branch
    // below also requires that no "title" is cached yet, so the first one found always wins
    // and the ranking never takes effect. Confirm which behavior is intended.
    int titleness = 4;

    if (contentType != null)
    {
        if (contentType.SubType.Contains("html"))
        {
            using (var reader = new HtmlReader(stream))
            {
                while (!reader.EOF)
                {
                    var node = reader.NextNode();

                    if (node.Type == HtmlTokenType.Text)
                    {
                        evaluationText.Append(node.Value);
                        evaluationText.Append(' ');

                        // Text is never a tag: without this, page text that is literally
                        // "script", "title", etc. would be dispatched as that tag below.
                        continue;
                    }

                    if (node.Value == "script")
                    {
                        ReadToEndTag(reader, "script");
                    }
                    else if (node.Value == "style")
                    {
                        ReadToEndTag(reader, "style");
                    }
                    else if (node.Value == "title" && titleness > 1 && !result.PropertyCache.ContainsKey("title"))
                    {
                        string title = ReadTitle(reader, "title");
                        result.PropertyCache.Add("title", title);
                        titleness = 1;
                    }
                    else if (node.Value == "h1" && titleness > 2 && !result.PropertyCache.ContainsKey("title"))
                    {
                        string title = ReadTitle(reader, "h1");
                        result.PropertyCache.Add("title", title);
                        titleness = 2;
                    }
                    else if (node.Value == "h2" && titleness > 3 && !result.PropertyCache.ContainsKey("title"))
                    {
                        string title = ReadTitle(reader, "h2");
                        result.PropertyCache.Add("title", title);
                        titleness = 3;
                    }
                    else if (node.Value == "meta")
                    {
                        ProcessMetaTag(reader, result);
                    }
                }
            }
        }
        else if (contentType.IsTextType || ContentType.IsJavascriptContentType(contentType))
        {
            // leave the stream open so other tasks can reset it and use it
            using (var reader = new StreamReader(stream, Encoding.Default, true, 1024, true))
            {
                evaluationText.Append(reader.ReadToEnd());
            }
        }
    }

    result.UriHash = MD5Hash.Compute(result.RequestProperties.SafeGet("uri") ?? "").ToString();
    result.RefererUri = result.RequestProperties.SafeGet("referer");
    result.Uri = result.RequestProperties.SafeGet("uri") ?? "";
    result.Title = result.PropertyCache.SafeGet("title")?.ToString();
    result.Description = result.PropertyCache.SafeGet("description")?.ToString();
    result.Created = DateTime.UtcNow;
    result.Updated = DateTime.UtcNow;

    evalText = evaluationText.ToString();
    return result;
}