Beispiel #1
0
 // ------------------------------------------------------------------
 /// <summary>
 /// Initializes a new instance of the <see cref="ResourceParser"/> class
 /// and remembers the passed arguments in the instance fields.
 /// </summary>
 /// <param name="settings">The spider settings to remember.</param>
 /// <param name="uriInfo">The URI information to remember.</param>
 /// <param name="textContent">The text content to remember.</param>
 public ResourceParser(
     SpiderSettings settings,
     UriResourceInformation uriInfo,
     string textContent)
 {
     // Plain field initialization; no validation is performed here.
     this._textContent = textContent;
     this._uriInfo = uriInfo;
     this._settings = settings;
 }
Beispiel #2
0
        /// <summary>
        /// Stores a binary resource to the local file system.
        /// </summary>
        /// <remarks>
        /// An already existing target file is deleted first. The file is only
        /// written when <paramref name="binaryContent"/> is non-empty; in that
        /// case a missing target directory is created beforehand.
        /// <see cref="IOException"/> and <see cref="UnauthorizedAccessException"/>
        /// are reported on the console and then ignored (best effort).
        /// </remarks>
        /// <param name="binaryContent">The bytes to write. May be null or empty.</param>
        /// <param name="uriInfo">The URI information of the resource to store.</param>
        /// <returns>Return the info about the stored data.</returns>
        public DownloadedResourceInformation StoreBinary(
            byte[] binaryContent,
            UriResourceInformation uriInfo )
        {
            DownloadedResourceInformation result =
                new DownloadedResourceInformation(
                uriInfo,
                _settings.Options.DestinationFolderPath );

            try
            {
                if ( result.LocalFilePath.Exists )
                {
                    result.LocalFilePath.Delete();
                }

                if ( binaryContent != null && binaryContent.Length > 0 )
                {
                    // Without the directory, OpenWrite() fails with a
                    // DirectoryNotFoundException that would be swallowed below,
                    // silently losing the resource.
                    if ( !result.LocalFilePath.Directory.Exists )
                    {
                        result.LocalFilePath.Directory.Create();
                    }

                    Console.WriteLine(
                        @"Writing binary content to file '{0}'.",
                        result.LocalFilePath );

                    using ( FileStream s = result.LocalFilePath.OpenWrite() )
                    {
                        s.Write( binaryContent, 0, binaryContent.Length );
                    }
                }
            }
            catch ( IOException x )
            {
                // Deliberately best effort: report and continue.
                Console.WriteLine(
                    @"Ignoring IO exception while storing binary file: '{0}'.",
                    x.Message );
            }
            catch ( UnauthorizedAccessException x )
            {
                // Deliberately best effort: report and continue.
                Console.WriteLine(
                    @"Ignoring exception while storing binary file: '{0}'.",
                    x.Message );
            }

            return result;
        }
        /// <summary>
        /// Replace URIs inside a given HTML document that was previously 
        /// downloaded with the local URIs.
        /// </summary>
        /// <remarks>
        /// Links are extracted with a <see cref="ResourceParser"/>. For every link
        /// that is flagged as "want follow" or as a resource, the occurrences of
        /// its original URL in double quotes, in single quotes and inside
        /// parentheses (style "url(...)") are replaced with the local file name.
        /// Matching is case-insensitive.
        /// </remarks>
        /// <param name="textContent">The HTML text in which to replace the links.</param>
        /// <param name="uriInfo">The URI information of the document.</param>
        /// <returns>Returns the content text with the replaced links.</returns>
        /// <exception cref="ApplicationException">
        /// A link could not be replaced although its original URL differs from
        /// the local file name and its absolute path was not handled before.
        /// </exception>
        public string ReplaceLinks(
            string textContent,
            UriResourceInformation uriInfo)
        {
            ResourceParser parser = new ResourceParser(
                _settings,
                uriInfo,
                textContent);

            List<UriResourceInformation> linkInfos =
                parser.ExtractLinks();

            // For remembering duplicates (keyed by absolute path).
            Dictionary<string, string> replacedLinks =
                new Dictionary<string, string>();

            const RegexOptions options =
                RegexOptions.IgnoreCase | RegexOptions.Multiline;

            foreach (UriResourceInformation linkInfo in linkInfos)
            {
                if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
                {
                    continue;
                }

                DownloadedResourceInformation dlInfo =
                    new DownloadedResourceInformation(
                    linkInfo,
                    _settings.Options.DestinationFolderPath);

                if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
                {
                    continue;
                }

                string textContentBefore = textContent;

                // The URL is matched literally, so escape regex meta characters.
                string link = Regex.Escape(linkInfo.OriginalUrl);

                // In a replacement pattern '$' introduces a substitution
                // ("$&", "$_", "$$", ...). Escape it so that the local file
                // name is always inserted literally.
                string localName =
                    string.Format(@"{0}", dlInfo.LocalFileName).Replace("$", "$$");

                // Double-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"""{0}""", link),
                    string.Format(@"""{0}""", localName),
                    options);

                // Single-quoted links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"'{0}'", link),
                    string.Format(@"'{0}'", localName),
                    options);

                // For style-"url(...)"-links.
                textContent = Regex.Replace(
                    textContent,
                    string.Format(@"\(\s*{0}\s*\)", link),
                    string.Format(@"({0})", localName),
                    options);

                // Some checking.
                // 2007-07-27, Uwe Keim.
                // An unchanged text is only acceptable when the URL already
                // equals the local name or the same absolute path was
                // processed by an earlier iteration.
                if (linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
                    textContentBefore == textContent &&
                    !replacedLinks.ContainsKey(linkInfo.AbsoluteUri.AbsolutePath))
                {
                    throw new ApplicationException(
                        string.Format(
                            @"Failed to replace URI '{0}' with URI '{1}' in HTML text '{2}'.",
                            linkInfo.OriginalUrl,
                            dlInfo.LocalFileName,
                            textContent));
                }

                // Remember.
                replacedLinks[linkInfo.AbsoluteUri.AbsolutePath] =
                    linkInfo.AbsoluteUri.AbsolutePath;
            }

            return textContent;
        }
Beispiel #4
0
        // ------------------------------------------------------------------
        /// <summary>
        /// Reads the given document and collects the links found in the
        /// attributes of link elements. The content of comment nodes is parsed
        /// recursively, so links inside comments are collected, too.
        /// </summary>
        /// <remarks>
        /// A link is only added when it is processable, when it is on the same
        /// site as <paramref name="uriInfo"/> (or staying on site is not
        /// requested in the options), and when its original URL contains
        /// "session=" followed by the target session from the options.
        /// </remarks>
        /// <param name="xml">The reader positioned before the content to scan.</param>
        /// <param name="uriInfo">The URI info of the document being scanned.</param>
        /// <returns>The list of extracted links; empty when none was found.</returns>
        private List<UriResourceInformation> DoExtractLinks(
            XmlReader xml,
            UriResourceInformation uriInfo)
        {
            // Resulting resource list.
            List<UriResourceInformation> links = new List<UriResourceInformation>();

            // Loop through the HTML doc as XML.
            while (xml.Read())
            {
                switch (xml.NodeType)
                {
                    // Grab links inside comments, too.
                    case XmlNodeType.Comment:
                        // The nested reader is only needed for this comment,
                        // so release it as soon as its links are collected.
                        using (XmlReader childXml =
                            GetDocReader(xml.Value, uriInfo.BaseUri))
                        {
                            links.AddRange(DoExtractLinks(childXml, uriInfo));
                        }
                        break;

                    // An HTML node element.
                    case XmlNodeType.Element:

                        // Names of the attributes that carry URLs for this element.
                        string[] linkAttributeNames;
                        // Link type reported for this element.
                        UriType linkType;

                        // Only link elements are of interest. Style attributes
                        // of other elements are currently not scanned.
                        if (IsLinkElement(
                            xml.Name,
                            out linkAttributeNames,
                            out linkType))
                        {
                            // Loop through all the attributes of the element.
                            while (xml.MoveToNextAttribute())
                            {
                                foreach (string a in linkAttributeNames)
                                {
                                    // Attribute names are identifiers, not
                                    // linguistic text: compare ordinally so the
                                    // current culture cannot break the match.
                                    if (!string.Equals(
                                        a,
                                        xml.Name,
                                        StringComparison.OrdinalIgnoreCase))
                                    {
                                        continue;
                                    }

                                    string url = xml.Value;

                                    // Flag resource as a form. Compared
                                    // case-insensitively, consistent with the
                                    // attribute name match above.
                                    if (string.Equals(
                                        xml.Name,
                                        @"action",
                                        StringComparison.OrdinalIgnoreCase))
                                    {
                                        linkType = UriType.Form;
                                    }

                                    // Create link.
                                    UriResourceInformation ui =
                                        new UriResourceInformation(
                                            _settings.Options,
                                            url,
                                            new Uri(url, UriKind.RelativeOrAbsolute),
                                            uriInfo.BaseUriWithFolder,
                                            linkType,
                                            uriInfo.AbsoluteUri,
                                            uriInfo.Index);

                                    // Is in same domain?
                                    bool isOnSameSite =
                                        ui.IsOnSameSite(uriInfo.BaseUri);

                                    // Stay on site, and is processable.
                                    if ((isOnSameSite ||
                                        !_settings.Options.StayOnSite) &&
                                        ui.IsProcessableUri)
                                    {
                                        // Only keep links that point to the
                                        // targeted session.
                                        if (ui.OriginalUrl.Contains(String.Format("session={0}",
                                            _settings.Options.TargetSession)))
                                        {
                                            // Add the resource.
                                            links.Add(ui);
                                        }
                                    }
                                }
                            }
                        }
                        break;
                }
            }

            return links;
        }
Beispiel #5
0
        /// <summary>
        /// Runs the <c>Collect</c> method of every requested collector against
        /// the given document and returns the collectors in a new list.
        /// </summary>
        /// <param name="xml">The reader that is handed to each collector.</param>
        /// <param name="_uriInfo">The URI info that is handed to each collector.</param>
        /// <param name="req">The collectors to run.</param>
        /// <returns>A new list holding the same collectors, in the same order.</returns>
        private List<iCollector> DoExtractCollector(XmlReader xml, UriResourceInformation _uriInfo, List<iCollector> req)
        {
            // Let each collector process the document in turn.
            foreach (iCollector collector in req)
            {
                collector.Collect(xml, _uriInfo);
            }

            // Hand back a copy, not the caller's list instance.
            List<iCollector> processed = new List<iCollector>(req);
            return processed;
        }
Beispiel #6
0
        /// <summary>
        /// Writes HTML text to the local file system, using the given encoding.
        /// Hyperlinks inside the text are not replaced.
        /// </summary>
        /// <remarks>
        /// An already existing target file is deleted and a missing target
        /// directory is created before writing.
        /// <see cref="IOException"/> and <see cref="UnauthorizedAccessException"/>
        /// are reported on the console and then ignored.
        /// </remarks>
        /// <param name="textContent">The HTML text to write.</param>
        /// <param name="encoding">The encoding used to write the text.</param>
        /// <param name="uriInfo">The URI information of the resource to store.</param>
        /// <returns>Return the info about the stored data.</returns>
        public DownloadedResourceInformation StoreHtml(
            string textContent,
            Encoding encoding,
            UriResourceInformation uriInfo )
        {
            DownloadedResourceInformation stored = new DownloadedResourceInformation(
                uriInfo,
                _settings.Options.DestinationFolderPath );

            try
            {
                // Start from a clean slate: drop a stale file ...
                if ( stored.LocalFilePath.Exists )
                {
                    stored.LocalFilePath.Delete();
                }

                // ... and make sure the folder to write into is present.
                if ( !stored.LocalFilePath.Directory.Exists )
                {
                    stored.LocalFilePath.Directory.Create();
                }

                Console.WriteLine(
                    @"Writing text content to file '{0}'.",
                    stored.LocalFilePath );

                using ( FileStream output = new FileStream(
                    stored.LocalFilePath.FullName,
                    FileMode.Create,
                    FileAccess.Write ) )
                {
                    using ( StreamWriter writer = new StreamWriter( output, encoding ) )
                    {
                        writer.Write( textContent );
                    }
                }
            }
            catch ( IOException ioError )
            {
                // Best effort: report and carry on.
                Console.WriteLine(
                    @"Ignoring IO exception while storing HTML file: '{0}'.",
                    ioError.Message );
            }
            catch ( UnauthorizedAccessException accessError )
            {
                // Best effort: report and carry on.
                Console.WriteLine(
                    @"Ignoring exception while storing HTML file: '{0}'.",
                    accessError.Message );
            }

            return stored;
        }
 /// <summary>
 /// Copy constructor. Initializes a new instance of the
 /// <see cref="UriResourceInformation"/> class with the field values of
 /// another instance.
 /// </summary>
 /// <param name="copyFrom">The instance whose field values are taken over.</param>
 public UriResourceInformation(
     UriResourceInformation copyFrom)
 {
     // The references are taken over as they are; nothing is cloned.
     this._options = copyFrom._options;
     this._linkType = copyFrom._linkType;

     this._originalUrl = copyFrom._originalUrl;

     this._baseUri = copyFrom._baseUri;
     this._parentUri = copyFrom._parentUri;
     this._relativeUri = copyFrom._relativeUri;
     this._absoluteUri = copyFrom._absoluteUri;
 }