// ------------------------------------------------------------------

/// <summary>
/// Creates a parser for a single text resource.
/// </summary>
/// <param name="settings">The spider settings kept by this parser.</param>
/// <param name="uriInfo">The URI information of the resource to parse.</param>
/// <param name="textContent">The text content of the resource.</param>
public ResourceParser(
    SpiderSettings settings,
    UriResourceInformation uriInfo,
    string textContent)
{
    // Plain field capture; no validation is performed on the arguments.
    this._textContent = textContent;
    this._uriInfo = uriInfo;
    this._settings = settings;
}
/// <summary>
/// Stores a binary resource to the local file system.
/// </summary>
/// <remarks>
/// A file that already exists at the target location is deleted first.
/// Nothing is written when <paramref name="binaryContent"/> is null or
/// empty. <see cref="IOException"/> and
/// <see cref="UnauthorizedAccessException"/> are reported on the console
/// and otherwise ignored, so storing is best-effort.
/// </remarks>
/// <param name="binaryContent">The raw bytes to store.</param>
/// <param name="uriInfo">The URI information of the resource the bytes belong to.</param>
/// <returns>Return the info about the stored data.</returns>
public DownloadedResourceInformation StoreBinary(
    byte[] binaryContent,
    UriResourceInformation uriInfo)
{
    DownloadedResourceInformation result =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        if (result.LocalFilePath.Exists)
        {
            result.LocalFilePath.Delete();
        }

        if (binaryContent != null && binaryContent.Length > 0)
        {
            // Create the target folder on demand. Without this, opening the
            // file throws DirectoryNotFoundException (an IOException), which
            // the handler below swallows, silently losing the resource.
            if (!result.LocalFilePath.Directory.Exists)
            {
                result.LocalFilePath.Directory.Create();
            }

            Console.WriteLine(
                @"Writing binary content to file '{0}'.",
                result.LocalFilePath);

            using (FileStream s = result.LocalFilePath.OpenWrite())
            {
                s.Write(binaryContent, 0, binaryContent.Length);
            }
        }
    }
    catch (IOException x)
    {
        Console.WriteLine(
            @"Ignoring IO exception while storing binary file: '{0}'.",
            x.Message);
    }
    catch (UnauthorizedAccessException x)
    {
        Console.WriteLine(
            @"Ignoring exception while storing binary file: '{0}'.",
            x.Message);
    }

    return result;
}
/// <summary>
/// Replace URIs inside a given HTML document that was previously
/// downloaded with the local URIs.
/// </summary>
/// <remarks>
/// Links are extracted with a new <see cref="ResourceParser"/>. Each link
/// that is to be followed or that is a resource URI is replaced where it
/// appears in double quotes, in single quotes, or inside parentheses
/// (style "url(...)" links). Matching ignores case.
/// </remarks>
/// <param name="textContent">The HTML text whose links are rewritten.</param>
/// <param name="uriInfo">The URI information of the document.</param>
/// <returns>Returns the content text with the replaced links.</returns>
/// <exception cref="ApplicationException">
/// A link's original URL differs from the local file name, no replacement
/// changed the text, and the link's absolute path was not handled before.
/// </exception>
public string ReplaceLinks(
    string textContent,
    UriResourceInformation uriInfo)
{
    ResourceParser parser = new ResourceParser(
        _settings,
        uriInfo,
        textContent);

    List<UriResourceInformation> linkInfos = parser.ExtractLinks();

    // For remembering duplicates: absolute paths that were already handled.
    HashSet<string> replacedLinks = new HashSet<string>();

    const RegexOptions options =
        RegexOptions.IgnoreCase | RegexOptions.Multiline;

    foreach (UriResourceInformation linkInfo in linkInfos)
    {
        if (!linkInfo.WantFollowUri && !linkInfo.IsResourceUri)
        {
            continue;
        }

        DownloadedResourceInformation dlInfo =
            new DownloadedResourceInformation(
                linkInfo,
                _settings.Options.DestinationFolderPath);

        if (string.IsNullOrEmpty(linkInfo.OriginalUrl))
        {
            continue;
        }

        string textContentBefore = textContent;
        string link = Regex.Escape(linkInfo.OriginalUrl);

        // In a replacement pattern '$' introduces a substitution ($1, $&,
        // ...). Double it so the local name is always inserted literally.
        string localName =
            string.Format(@"{0}", dlInfo.LocalFileName)
                .Replace(@"$", @"$$");

        // Double-quoted links.
        textContent = Regex.Replace(
            textContent,
            @"""" + link + @"""",
            @"""" + localName + @"""",
            options);

        // Single-quoted links.
        textContent = Regex.Replace(
            textContent,
            @"'" + link + @"'",
            @"'" + localName + @"'",
            options);

        // For style-"url(...)"-links.
        textContent = Regex.Replace(
            textContent,
            @"\(\s*" + link + @"\s*\)",
            @"(" + localName + @")",
            options);

        // Some checking.
        // 2007-07-27, Uwe Keim.
        string absolutePath = linkInfo.AbsoluteUri.AbsolutePath;

        if (linkInfo.OriginalUrl != dlInfo.LocalFileName.Name &&
            textContentBefore == textContent &&
            !replacedLinks.Contains(absolutePath))
        {
            throw new ApplicationException(
                string.Format(
                    @"Failed to replace URI '{0}' with URI '{1}' in HTML text '{2}'.",
                    linkInfo.OriginalUrl,
                    dlInfo.LocalFileName,
                    textContent));
        }

        // Remember.
        replacedLinks.Add(absolutePath);
    }

    return textContent;
}
// ------------------------------------------------------------------

/// <summary>
/// Walks the given reader and collects the links found in link elements,
/// including links found inside comments.
/// </summary>
/// <remarks>
/// A link is added only when it is processable, when it is on the same
/// site as <paramref name="uriInfo"/> or staying on site is not required
/// by the options, and when its original URL contains
/// "session=" followed by the configured target session.
/// </remarks>
/// <param name="xml">The reader over the HTML document, read as XML.</param>
/// <param name="uriInfo">The URI information of the document being read.</param>
/// <returns>The list of links that passed the filters; never null.</returns>
private List<UriResourceInformation> DoExtractLinks(
    XmlReader xml,
    UriResourceInformation uriInfo)
{
    // Resulting resource list.
    List<UriResourceInformation> links =
        new List<UriResourceInformation>();

    // Loop through the HTML doc as XML.
    while (xml.Read())
    {
        switch (xml.NodeType)
        {
            // Grab links inside comments, too: the comment text is read
            // as a document of its own and processed recursively.
            case XmlNodeType.Comment:
            {
                using (XmlReader childXml =
                    GetDocReader(xml.Value, uriInfo.BaseUri))
                {
                    links.AddRange(DoExtractLinks(childXml, uriInfo));
                }

                break;
            }

            // An HTML node element.
            case XmlNodeType.Element:
            {
                // Names of the attributes that carry links on this element.
                string[] linkAttributeNames;

                // Link type reported for this element.
                UriType linkType;

                if (!IsLinkElement(
                    xml.Name,
                    out linkAttributeNames,
                    out linkType))
                {
                    // Not a link element. URLs inside style attributes
                    // are currently not extracted.
                    break;
                }

                // Loop through all the attributes of the element.
                while (xml.MoveToNextAttribute())
                {
                    foreach (string attributeName in linkAttributeNames)
                    {
                        // Attribute names are identifiers, so compare them
                        // ordinally and without regard to case.
                        if (!string.Equals(
                            attributeName,
                            xml.Name,
                            StringComparison.OrdinalIgnoreCase))
                        {
                            continue;
                        }

                        string url = xml.Value;

                        // Flag resource as a form. Compared like the
                        // attribute match above, so "ACTION" counts too.
                        if (string.Equals(
                            xml.Name,
                            @"action",
                            StringComparison.OrdinalIgnoreCase))
                        {
                            linkType = UriType.Form;
                        }

                        // Create link.
                        UriResourceInformation ui =
                            new UriResourceInformation(
                                _settings.Options,
                                url,
                                new Uri(url, UriKind.RelativeOrAbsolute),
                                uriInfo.BaseUriWithFolder,
                                linkType,
                                uriInfo.AbsoluteUri,
                                uriInfo.Index);

                        // Is in same domain.
                        bool isOnSameSite =
                            ui.IsOnSameSite(uriInfo.BaseUri);

                        // Stay on site, and is processable.
                        if ((isOnSameSite || !_settings.Options.StayOnSite) &&
                            ui.IsProcessableUri)
                        {
                            // Only keep links that point to the
                            // configured target session.
                            if (ui.OriginalUrl.Contains(
                                String.Format(
                                    "session={0}",
                                    _settings.Options.TargetSession)))
                            {
                                links.Add(ui);
                            }
                        }
                    }
                }

                break;
            }
        }
    }

    return links;
}
/// <summary>
/// Runs every collector in <paramref name="req"/> against the given
/// reader and URI information.
/// </summary>
/// <param name="xml">The reader handed to each collector.</param>
/// <param name="_uriInfo">The URI information handed to each collector.</param>
/// <param name="req">The collectors to run, in list order.</param>
/// <returns>A new list holding the same collectors as <paramref name="req"/>.</returns>
private List<iCollector> DoExtractCollector(
    XmlReader xml,
    UriResourceInformation _uriInfo,
    List<iCollector> req)
{
    // Let each requested collector process the document.
    foreach (iCollector collector in req)
    {
        collector.Collect(xml, _uriInfo);
    }

    // Hand back a copy so the caller's list instance is not shared.
    return new List<iCollector>(req);
}
/// <summary>
/// Writes HTML text to its local file, using the given encoding.
/// Links inside the text are left untouched.
/// </summary>
/// <remarks>
/// An existing target file is deleted and a missing target directory is
/// created before writing. <see cref="IOException"/> and
/// <see cref="UnauthorizedAccessException"/> are reported on the console
/// and otherwise ignored.
/// </remarks>
/// <param name="textContent">The text to write.</param>
/// <param name="encoding">The encoding used for the written file.</param>
/// <param name="uriInfo">The URI information of the resource the text belongs to.</param>
/// <returns>Return the info about the stored data.</returns>
public DownloadedResourceInformation StoreHtml(
    string textContent,
    Encoding encoding,
    UriResourceInformation uriInfo)
{
    DownloadedResourceInformation stored =
        new DownloadedResourceInformation(
            uriInfo,
            _settings.Options.DestinationFolderPath);

    try
    {
        // Remove a previous copy of the file, if any.
        if (stored.LocalFilePath.Exists)
        {
            stored.LocalFilePath.Delete();
        }

        // Make sure the target folder is there.
        if (!stored.LocalFilePath.Directory.Exists)
        {
            stored.LocalFilePath.Directory.Create();
        }

        string progressMessage = string.Format(
            @"Writing text content to file '{0}'.",
            stored.LocalFilePath);
        Console.WriteLine(progressMessage);

        using (FileStream fileStream = new FileStream(
            stored.LocalFilePath.FullName,
            FileMode.Create,
            FileAccess.Write))
        {
            using (StreamWriter writer =
                new StreamWriter(fileStream, encoding))
            {
                writer.Write(textContent);
            }
        }
    }
    catch (IOException ioError)
    {
        string ioMessage = string.Format(
            @"Ignoring IO exception while storing HTML file: '{0}'.",
            ioError.Message);
        Console.WriteLine(ioMessage);
    }
    catch (UnauthorizedAccessException accessError)
    {
        string accessMessage = string.Format(
            @"Ignoring exception while storing HTML file: '{0}'.",
            accessError.Message);
        Console.WriteLine(accessMessage);
    }

    return stored;
}
/// <summary>
/// Copy constructor: creates a new
/// <see cref="UriResourceInformation"/> that takes over the field values
/// of an existing instance.
/// </summary>
/// <param name="copyFrom">The instance whose fields are copied.</param>
public UriResourceInformation(
    UriResourceInformation copyFrom)
{
    // NOTE(review): the multi-argument constructor is called elsewhere in
    // this file with an index value; no index is copied here. Confirm
    // whether that is intentional.

    // Settings and classification.
    this._options = copyFrom._options;
    this._linkType = copyFrom._linkType;

    // The URL as originally written.
    this._originalUrl = copyFrom._originalUrl;

    // The various URI forms.
    this._relativeUri = copyFrom._relativeUri;
    this._baseUri = copyFrom._baseUri;
    this._absoluteUri = copyFrom._absoluteUri;
    this._parentUri = copyFrom._parentUri;
}