Example #1
0
        /// <summary>
        /// Consumes nodes from <paramref name="reader"/> until an end tag whose value
        /// equals <paramref name="endTag"/> has been read, or the reader reports EOF.
        /// </summary>
        /// <param name="reader">Reader to pull nodes from.</param>
        /// <param name="endTag">Value of the end tag that stops the scan.</param>
        void ReadToEndTag(HtmlReader reader, string endTag)
        {
            bool reachedEndTag = false;

            // The flag is tested first so EOF is not queried again once the tag was found.
            while (!reachedEndTag && !reader.EOF)
            {
                var token = reader.NextNode();

                reachedEndTag = token.Value == endTag && token.Type == HtmlTokenType.EndTag;
            }
        }
Example #2
0
        /// <summary>
        /// Concatenates the text nodes read from <paramref name="reader"/> until the
        /// end tag named by <paramref name="endTag"/> is read or the reader reports EOF.
        /// </summary>
        /// <param name="reader">Reader to pull nodes from.</param>
        /// <param name="endTag">
        /// Value of the end tag that terminates the title (for example "title" or "h1").
        /// </param>
        /// <returns>
        /// The collected text after being passed through <c>SanitiseAttribute</c>.
        /// If EOF is hit before the end tag, whatever text was collected so far is returned.
        /// </returns>
        string ReadTitle(HtmlReader reader, string endTag)
        {
            string title = "";

            while (!reader.EOF)
            {
                var node = reader.NextNode();

                if (node.Type == HtmlTokenType.Text)
                {
                    title += node.Value;
                }
                else if (node.Type == HtmlTokenType.EndTag && node.Value == endTag)
                {
                    // Stop at the caller-supplied tag; previously "title" was hard-coded,
                    // so any other endTag ran on to </title> or EOF.
                    break;
                }
            }

            // 128 is presumably a maximum length -- confirm against SanitiseAttribute.
            return SanitiseAttribute(title, 128);
        }
        /// <summary>
        /// Buffers the response body in memory and, when the content type is unknown
        /// or has media type "text", scans it with <c>HtmlReader</c> and writes the
        /// <c>src</c> attribute of every <c>script</c> start tag to the console.
        /// </summary>
        /// <param name="dataStream">Response body; a null stream is ignored.</param>
        /// <returns>A task that completes once the body has been processed.</returns>
        public override async Task ProcessResponseStream(Stream dataStream)
        {
            if (dataStream == null)
            {
                return;
            }

            // The buffer is owned by this method, so dispose it when done.
            using (var ms = new MemoryStream())
            {
                // Copy asynchronously instead of blocking the caller's thread on I/O.
                // Nothing below needs the captured context, hence ConfigureAwait(false).
                await dataStream.CopyToAsync(ms).ConfigureAwait(false);

                ms.Seek(0, SeekOrigin.Begin);
                if (ContentType.IsUnknownOrNull(ContentType))
                {
                    ContentType = ContentType.Guess(ms);
                }

                // Rewind again so the HTML scan starts from the beginning of the buffer.
                ms.Seek(0, SeekOrigin.Begin);

                if (ContentType.IsUnknownOrNull(ContentType) || ContentType.MediaType == "text")
                {
                    using (var p = new HtmlReader(ms))
                    {
                        while (!p.EOF)
                        {
                            var node = p.NextNode();

                            if (node.Type == HtmlTokenType.StartTag && node.Value == "script")
                            {
                                string src = p.GetAttribute("src");
                                if (!string.IsNullOrWhiteSpace(src))
                                {
                                    Console.WriteLine(src);
                                }
                            }
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Reads nodes from <c>reader</c> until one yields a link, and returns that link.
        /// </summary>
        /// <remarks>
        /// Links are taken from the <c>href</c> attribute of "a" and "link" nodes and
        /// the <c>src</c> attribute of "script" nodes, resolved through
        /// <c>Utility.GetLinks</c> against <c>CurrentSourceUri</c>. A "base" node does
        /// not produce a result; it replaces <c>CurrentSourceUri</c> instead.
        /// NOTE(review): the node type is not checked, so end tags with these names are
        /// also inspected -- presumably GetAttribute yields nothing for them; confirm.
        /// </remarks>
        /// <returns>The next link found, or null when the reader reaches EOF first.</returns>
        public Uri NextUri()
        {
            Uri uri = null;

            while (!reader.EOF && uri == null)
            {
                var node = reader.NextNode();

                switch (node.Value)
                {
                    case "a":
                    case "link":
                        uri = FirstLinkOrNull(CurrentSourceUri, reader.GetAttribute("href"));
                        break;

                    case "script":
                        uri = FirstLinkOrNull(CurrentSourceUri, reader.GetAttribute("src"));
                        break;

                    case "base":
                        // sometimes the links are bogus?! -- only switch the base
                        // when the href actually produced a link.
                        Uri baseUri = FirstLinkOrNull(null, reader.GetAttribute("href"));
                        if (baseUri != null)
                        {
                            CurrentSourceUri = baseUri;
                        }
                        break;
                }
            }

            return uri;
        }

        /// <summary>
        /// Returns the first link <c>Utility.GetLinks</c> produces for
        /// <paramref name="reference"/>, or null when the reference is blank or no
        /// link is produced. The result is enumerated only once.
        /// </summary>
        private Uri FirstLinkOrNull(Uri sourceUri, string reference)
        {
            if (string.IsNullOrWhiteSpace(reference))
            {
                return null;
            }

            return Utility.GetLinks(sourceUri, reference).FirstOrDefault();
        }
Example #5
0
        /// <summary>
        /// Builds a <see cref="WorkspaceResult"/> for a fetched response: records the
        /// body hash and size, and for HTML or text/javascript content collects the
        /// evaluation text and, for HTML, a title.
        /// </summary>
        /// <param name="stream">Seekable stream holding the response body.</param>
        /// <param name="requestString">Passed to <c>ProcessHeaders</c>.</param>
        /// <param name="responseHeaders">Passed to <c>ProcessHeaders</c>.</param>
        /// <param name="evalText">Receives the contents of <c>evaluationText</c>.</param>
        /// <returns>The populated result.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        /// <exception cref="FetchoException"><paramref name="stream"/> is not seekable.</exception>
        /// <remarks>
        /// NOTE(review): <c>evaluationText</c> is declared outside this method and is
        /// not cleared here -- confirm it cannot carry text over from an earlier call.
        /// </remarks>
        public WorkspaceResult Build(Stream stream, string requestString, string responseHeaders, out string evalText)
        {
            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            WorkspaceResult result = new WorkspaceResult();

            result.SourceServerId = FetchoConfiguration.Current.CurrentServerNode.ServerId;

            if (!stream.CanSeek)
            {
                throw new FetchoException("WorkspaceResultBuilder needs a seekable stream");
            }

            ProcessHeaders(result, requestString, responseHeaders);
            result.DataHash = MD5Hash.Compute(stream).ToString();
            result.PageSize = stream.Length;
            // Rewind after hashing so the content scan starts at the beginning.
            stream.Seek(0, SeekOrigin.Begin);

            ContentType contentType = GetContentType(result);

            // Rank of the best title source seen so far: 1 = title, 2 = h1, 3 = h2,
            // 4 = none. Lower is better.
            // NOTE(review): because of the ContainsKey("title") guard the first title
            // stored always wins, so a later better-ranked tag never replaces it --
            // confirm that is intended.
            int titleness = 4;

            if (contentType != null)
            {
                if (contentType.SubType.Contains("html"))
                {
                    // NOTE(review): whether disposing HtmlReader also closes the stream
                    // is not visible here; the text branch below deliberately leaves it open.
                    using (var reader = new HtmlReader(stream))
                    {
                        while (!reader.EOF)
                        {
                            var node = reader.NextNode();

                            if (node.Type == HtmlTokenType.Text)
                            {
                                evaluationText.Append(node.Value);
                                evaluationText.Append(' ');

                                // Text is content, not markup: without this, a text node
                                // reading exactly "script", "style", "title", ... was
                                // treated as that tag and swallowed what followed.
                                continue;
                            }

                            if (node.Value == "script")
                            {
                                ReadToEndTag(reader, "script");
                            }
                            else if (node.Value == "style")
                            {
                                ReadToEndTag(reader, "style");
                            }
                            else if (node.Value == "title" && titleness > 1 && !result.PropertyCache.ContainsKey("title"))
                            {
                                string title = ReadTitle(reader, "title");
                                result.PropertyCache.Add("title", title);
                                titleness = 1;
                            }
                            else if (node.Value == "h1" && titleness > 2 && !result.PropertyCache.ContainsKey("title"))
                            {
                                string title = ReadTitle(reader, "h1");
                                result.PropertyCache.Add("title", title);
                                titleness = 2;
                            }
                            else if (node.Value == "h2" && titleness > 3 && !result.PropertyCache.ContainsKey("title"))
                            {
                                string title = ReadTitle(reader, "h2");
                                result.PropertyCache.Add("title", title);
                                titleness = 3;
                            }
                            else if (node.Value == "meta")
                            {
                                ProcessMetaTag(reader, result);
                            }
                        }
                    }
                }
                else if (contentType.IsTextType || ContentType.IsJavascriptContentType(contentType))
                {
                    // leave the stream open so other tasks can reset it and use it
                    using (var reader = new StreamReader(stream, Encoding.Default, true, 1024, true))
                    {
                        evaluationText.Append(reader.ReadToEnd());
                    }
                }
            }

            // Look the request URI up once; an absent value is treated as "".
            var uri = result.RequestProperties.SafeGet("uri") ?? "";

            result.UriHash     = MD5Hash.Compute(uri).ToString();
            result.RefererUri  = result.RequestProperties.SafeGet("referer");
            result.Uri         = uri;
            result.Title       = result.PropertyCache.SafeGet("title")?.ToString();
            result.Description = result.PropertyCache.SafeGet("description")?.ToString();
            result.Created     = DateTime.UtcNow;
            result.Updated     = DateTime.UtcNow;

            evalText = evaluationText.ToString();

            return result;
        }