/// <summary>
/// Raises an ExtractContentComplete event when the extraction of content is complete.
/// </summary>
/// <param name="e">The <see cref="ParserEventArgs"/> related to the event.</param>
private void OnExtractContentComplete(ParserEventArgs e)
{
    if (ExtractContentComplete != null)
    {
        ExtractContentComplete(this, e);
    }
}
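// Illustrative sketch, not part of the original source: a client can observe the
// completion events raised by the methods in this section by attaching handlers in
// the style of the crawler_UrlProcessed handler shown further below. The "parser"
// variable and the ParserEventHandler delegate name are assumptions.
//
//     parser.ExtractContentComplete += new ParserEventHandler(parser_ExtractContentComplete);
//
//     private void parser_ExtractContentComplete(object sender, ParserEventArgs e)
//     {
//         Console.WriteLine("Content extraction complete for " + e.Url);
//     }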
/// <summary>
/// Extracts links from the contents of a document.
/// </summary>
/// <param name="content">The contents of the document.</param>
/// <param name="contentUrl">The url of the document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty ArrayList.</remarks>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = new ArrayList();
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    return links;
}
/// <summary>
/// Performs the extraction of content from a document.
/// </summary>
/// <param name="content">
/// The contents of the document from which the content must be extracted.
/// </param>
/// <param name="Flag">The parameter is not used in this method.</param>
/// <returns>A string containing the desired extracted content.</returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty string.</remarks>
public override string ExtractContent(byte[] content, bool Flag)
{
    string retVal = String.Empty;
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractContentComplete(e);
    return retVal;
}
/// <summary>
/// Extracts text from the contents of a document.
/// </summary>
/// <param name="content">The contents of the document.</param>
/// <returns>The text extracted from the document.</returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty string.</remarks>
public override string ExtractText(ref string content)
{
    string retVal = String.Empty;
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractTextComplete(e);
    return retVal;
}
/// <summary>
/// Performs the extraction of text from a text document. The text is extracted by
/// compacting consecutive white space characters.
/// </summary>
/// <param name="content">
/// The contents of the document from which the text must be extracted. It is passed
/// by reference in order to reduce memory consumption.
/// </param>
/// <returns>A string containing the 'clean' text extracted from the document.</returns>
public override string ExtractText(ref string content)
{
    string retVal = String.Empty;
    StringBuilder sb = new StringBuilder(content);
    sb.Replace("\r\n", " ");
    sb.Replace('\n', ' ');
    sb.Replace('\t', ' ');
    retVal = spacesRegex.Replace(sb.ToString(), " ");
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractTextComplete(e);
    return retVal;
}
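// Worked example (illustrative, not from the original source): assuming spacesRegex
// matches runs of white space (e.g. new Regex(@"\s+")), the method above collapses
// line breaks, tabs and repeated spaces into single spaces.
//
//     string content = "first  line\r\n\tsecond   line";
//     string clean = parser.ExtractText(ref content);
//     // clean == "first line second line"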
private void crawler_UrlProcessed(object sender, ParserEventArgs e)
{
    log.LogInfo("Crawler processed url " + e.Url);
}
/// <summary>
/// Extracts the desired contents of a document.
/// </summary>
/// <param name="content">The contents of the document.</param>
/// <param name="Flag">The parameter is not used in this method.</param>
/// <returns>The contents extracted from the document.</returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty string.</remarks>
public override string ExtractContent(ref string content, bool Flag)
{
    string retVal = String.Empty;
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractContentComplete(e);
    return retVal;
}
/// <summary>
/// Extracts text from the contents of a document.
/// </summary>
/// <param name="content">The contents of the document.</param>
/// <returns>The text extracted from the document.</returns>
/// <remarks>This method <b>ALWAYS</b> returns an empty string.</remarks>
public override string ExtractText(byte[] content)
{
    string retVal = String.Empty;
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractTextComplete(e);
    return retVal;
}
/// <summary>
/// Extracts text from the contents of a SWF document.
/// </summary>
/// <param name="content">The contents of the SWF document.</param>
/// <returns>The text extracted from the SWF document.</returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override string ExtractText(byte[] content)
{
    string retVal = String.Empty;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string swfFileName = FileName + ".swf";
        string htmFileName = FileName + ".htm";
        FileStream swf = null;
        StreamReader htm = null;
        try
        {
            //store the swf file
            swf = new FileStream(swfFileName, FileMode.Create);
            swf.Write(content, 0, content.Length);
            swf.Close();
            swf = null;
            //convert it to html
            bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
            if (success)
            {
                htm = new StreamReader(htmFileName, encoding);
                string html = htm.ReadToEnd();
                htm.Close();
                htm = null;
                retVal = parser.ExtractText(ref html);
            }
        }
        catch (Exception ex)
        {
            if (swf != null)
            {
                try { swf.Close(); }
                catch {}
            }
            if (htm != null)
            {
                try { htm.Close(); }
                catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogWarning("SwfParser failed to extract text: " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(swfFileName);
            File.Delete(htmFileName);
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("SwfParser failed to extract text: " + ex.Message);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractTextComplete(e);
    return retVal;
}
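// Illustrative usage sketch, not part of the original source: feeding raw SWF bytes
// to the parser. The file path and the swfParser instance are hypothetical; only the
// ExtractText(byte[]) signature comes from the method above.
//
//     byte[] swfBytes = File.ReadAllBytes("sample.swf");
//     string text = swfParser.ExtractText(swfBytes);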
/// <summary>
/// Performs the extraction of links from a text document. It can extract simple
/// links that are separated from the rest of the text using spaces or line breaks
/// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
/// of <see cref="InternetUrlToIndex"/> objects.
/// </summary>
/// <remarks>
/// Besides the parsing and extraction of Urls, ExtractLinks also performs other
/// tasks as well, such as:<br/>
/// <list type="bullet">
/// <item>
/// <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
/// </item>
/// <item>
/// <description>Filtering of multiple links to the same url and to the document itself.</description>
/// </item>
/// <item>
/// <description>Filtering of session id variables in dynamic Urls and limiting
/// of the number of GET variables in dynamic Urls.</description>
/// </item>
/// <item>
/// <description>Flagging of Urls according to their country domain.</description>
/// </item>
/// </list>
/// <b>Update History</b>
/// <list type="table">
/// <listheader>
/// <term>Date</term>
/// <description>Description</description>
/// </listheader>
/// <item>
/// <term>15/09/04</term>
/// <description>First release. A lot more needs to be done.</description>
/// </item>
/// </list>
/// </remarks>
/// <param name="content">The text that must be parsed for links. It is passed by
/// reference in order to reduce memory consumption.</param>
/// <param name="contentUrl">The Url from which the content comes.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = new ArrayList();
    // It is important to notice that if the FlagFetchRobots of the contentUrl is
    // true then the TextParser must remember this value because during the Robots
    // Filtering it will become false so as not to download the robots.txt file
    // every time a Url must be filtered.
    //bool FlagFetchRobots = contentUrl.FlagFetchRobots;
    try
    {
        //make sure only one thread will parse contents at a time.
        //mutex.WaitOne();
        if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
        {
            contentUrl.FlagDomain = ExtractDomainFlag(ref content);
            if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
            {
                if (InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                {
                    contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                }
            }
        }
        //perform the hyperlink matching
        MatchCollection matches = hrefRegex.Matches(content);
        if (matches.Count > 0)
        {
            string documentUrl = contentUrl.Url;
            string baseUrl = BaseUrl(ref documentUrl);
            byte priority = 0;
            foreach (Match m in matches)
            {
                try
                {
                    string url = m.Value.Trim();
                    url = NormalizeUrl(ref url, ref baseUrl);
                    priority = CleanUrlParams(ref url);
                    if (FilterUrl(ref url, ref documentUrl))
                    {
                        InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                        iurl.Priority = priority;
                        iurl.FlagDomain = domainFilter.FilterUrl(ref url);
                        //[mod 24/2/05] No robots.txt checking is performed for non-greek urls
                        if (iurl.FlagDomain == DomainFlagValue.MustVisit)
                        {
                            iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                        }
                        else
                        {
                            iurl.FlagRobots = false;
                        }
                        if (!links.Contains(iurl))
                        {
                            links.Add(iurl);
                        }
                    }
                }
                catch
                {
                    if (globals.Settings.LogLevel == CWLogLevel.LogInfo)
                    {
                        globals.FileLog.LogInfo("TextParser failed to parse " + m.Value);
                    }
                    continue;
                }
            }
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning(ex.Message);
        }
    }
    finally
    {
        //mutex.ReleaseMutex();
    }
    //contentUrl.FlagFetchRobots = FlagFetchRobots;
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    links.TrimToSize();
    return links;
}
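// Illustrative sketch, not part of the original source: consuming the result of
// ExtractLinks. The Priority, FlagDomain and FlagRobots members are taken from the
// method body above; the Url property and the parser variable are assumptions.
//
//     ArrayList links = parser.ExtractLinks(ref content, ref contentUrl);
//     foreach (InternetUrlToIndex link in links)
//     {
//         Console.WriteLine(link.Url + " priority=" + link.Priority + " robots=" + link.FlagRobots);
//     }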
/// <summary>
/// Raises the <see cref="UrlProcessed"/> event.
/// </summary>
/// <param name="e">The <see cref="ParserEventArgs"/> related to the event.</param>
private void OnUrlProcessed(ParserEventArgs e)
{
    if (UrlProcessed != null)
    {
        UrlProcessed(this, e);
    }
}
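// Illustrative sketch, not part of the original source: the crawler_UrlProcessed
// handler shown earlier would typically be attached like this; the "crawler" variable
// and the ParserEventHandler delegate name are assumptions.
//
//     crawler.UrlProcessed += new ParserEventHandler(crawler_UrlProcessed);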
/// <summary>
/// Extracts text from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <returns>The text extracted from the PDF document.</returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override string ExtractText(byte[] content)
{
    string retVal = String.Empty;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string pdfFileName = FileName + ".pdf";
        string txtFileName = FileName + ".txt";
        FileStream pdf = null;
        StreamReader txt = null;
        try
        {
            //store the pdf file
            pdf = new FileStream(pdfFileName, FileMode.Create);
            pdf.Write(content, 0, content.Length);
            pdf.Close();
            pdf = null;
            bool success = false;
            //convert it to text
            try
            {
                converter.loadFile(pdfFileName);
                converter.convertToTextFile(1, converter.numPages, txtFileName);
                success = true;
            }
            catch
            {
                success = false;
            }
            finally
            {
                converter.closeFile();
            }
            if (success)
            {
                txt = new StreamReader(txtFileName, encoding);
                string text = txt.ReadToEnd();
                txt.Close();
                txt = null;
                retVal = parser.ExtractText(ref text);
            }
            else
            {
                txt = null;
            }
        }
        catch (Exception ex)
        {
            if (pdf != null)
            {
                try { pdf.Close(); }
                catch {}
            }
            if (txt != null)
            {
                try { txt.Close(); }
                catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogWarning("PdfParser failed to extract text: " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(pdfFileName);
            File.Delete(txtFileName);
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("PdfParser failed to extract text: " + ex.Message);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractTextComplete(e);
    return retVal;
}
/// <summary>
/// Performs the extraction of content from a SWF document. Depending on the value
/// of the Flag provided it simply returns a string containing the HTML format of
/// the input or it returns the text format of the input after performing a white
/// space compaction.
/// </summary>
/// <param name="content">
/// The contents of the document from which the content must be extracted.
/// </param>
/// <param name="Flag">Determines what kind of processing will be performed on the
/// input. If set to false it simply returns a string containing the HTML format of
/// the input. If set to true it returns the text format of the input after performing
/// a white space compaction.
/// </param>
/// <returns>A string containing the desired extracted content.</returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override string ExtractContent(byte[] content, bool Flag)
{
    string retVal = String.Empty;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string swfFileName = FileName + ".swf";
        string htmFileName = FileName + ".htm";
        FileStream swf = null;
        StreamReader htm = null;
        try
        {
            //store the swf file
            swf = new FileStream(swfFileName, FileMode.Create);
            swf.Write(content, 0, content.Length);
            swf.Close();
            swf = null;
            //convert it to html
            bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
            if (success)
            {
                htm = new StreamReader(htmFileName, encoding);
                string html = htm.ReadToEnd();
                htm.Close();
                htm = null;
                if (!Flag)
                {
                    retVal = html;
                }
                else
                {
                    retVal = parser.ExtractText(ref html);
                }
            }
        }
        catch (Exception ex)
        {
            if (swf != null)
            {
                try { swf.Close(); }
                catch {}
            }
            if (htm != null)
            {
                try { htm.Close(); }
                catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogWarning("SwfParser failed to extract content: " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(swfFileName);
            File.Delete(htmFileName);
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("SwfParser failed to extract content: " + ex.Message);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractContentComplete(e);
    return retVal;
}
/// <summary>
/// Performs the extraction of content from a PDF document. Depending on the value
/// of the Flag provided it simply returns a string same as the text produced from
/// the parsing of the PDF document or it removes consecutive whitespace characters
/// in order to perform a compaction.
/// </summary>
/// <param name="content">
/// The contents of the document from which the content must be extracted.
/// </param>
/// <param name="Flag">Determines what kind of processing will be performed on the
/// input. If set to false it simply returns a string same as the text produced from
/// the parsing of the PDF document. If set to true it removes consecutive white
/// space characters in order to perform a compaction.
/// </param>
/// <returns>A string containing the desired extracted content.</returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override string ExtractContent(byte[] content, bool Flag)
{
    string retVal = String.Empty;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string pdfFileName = FileName + ".pdf";
        string txtFileName = FileName + ".txt";
        FileStream pdf = null;
        StreamReader txt = null;
        try
        {
            //store the pdf file
            pdf = new FileStream(pdfFileName, FileMode.Create);
            pdf.Write(content, 0, content.Length);
            pdf.Close();
            pdf = null;
            //convert it to text
            int success = converter.ConvertPdf2Text(pdfFileName, txtFileName);
            if (success == 0)
            {
                txt = new StreamReader(txtFileName, encoding);
                if (!Flag)
                {
                    retVal = txt.ReadToEnd();
                }
                else
                {
                    string text = txt.ReadToEnd();
                    retVal = parser.ExtractText(ref text);
                }
                txt.Close();
                txt = null;
            }
        }
        catch (Exception ex)
        {
            if (pdf != null)
            {
                try { pdf.Close(); }
                catch {}
            }
            if (txt != null)
            {
                try { txt.Close(); }
                catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogWarning("PdfParser failed to extract content: " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(pdfFileName);
            File.Delete(txtFileName);
        }
    }
    catch (Exception ex)
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("PdfParser failed to extract content: " + ex.Message);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(String.Empty);
    OnExtractContentComplete(e);
    return retVal;
}
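// Illustrative sketch, not part of the original source: the Flag parameter selects
// between the raw converter output and whitespace-compacted text. The pdfBytes
// variable and the pdfParser instance are hypothetical.
//
//     string raw   = pdfParser.ExtractContent(pdfBytes, false); // text exactly as produced by the converter
//     string clean = pdfParser.ExtractContent(pdfBytes, true);  // consecutive white space compacted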
/// <summary>
/// Extracts links from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <param name="contentUrl">The url of the PDF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = null;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string pdfFileName = FileName + ".pdf";
        string txtFileName = FileName + ".txt";
        FileStream pdf = null;
        StreamReader txt = null;
        try
        {
            //store the pdf file
            pdf = new FileStream(pdfFileName, FileMode.Create);
            pdf.Write(content, 0, content.Length);
            pdf.Close();
            pdf = null;
            //convert it to text
            int success = converter.ConvertPdf2Text(pdfFileName, txtFileName);
            if (success == 0)
            {
                txt = new StreamReader(txtFileName, encoding);
                string text = txt.ReadToEnd();
                txt.Close();
                txt = null;
                links = parser.ExtractLinks(ref text, ref contentUrl);
            }
            else
            {
                //the reader was never opened on a failed conversion, so there is
                //nothing to close here
                txt = null;
            }
        }
        catch (Exception ex)
        {
            if (pdf != null)
            {
                try { pdf.Close(); }
                catch {}
            }
            if (txt != null)
            {
                try { txt.Close(); }
                catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(pdfFileName);
            File.Delete(txtFileName);
        }
    }
    catch
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    return links;
}
/// <summary>
/// Raises an ExtractTextComplete event when the extraction of text is complete.
/// </summary>
/// <param name="e">The <see cref="ParserEventArgs"/> related to the event.</param>
private void OnExtractTextComplete(ParserEventArgs e)
{
    if (ExtractTextComplete != null)
    {
        ExtractTextComplete(this, e);
    }
}
/// <summary>
/// Extracts links from the contents of a PDF document.
/// </summary>
/// <param name="content">The contents of the PDF document.</param>
/// <param name="contentUrl">The url of the PDF document.</param>
/// <returns>
/// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
/// each link found in the content.
/// </returns>
/// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
{
    ArrayList links = null;
    if ((content == null) || (content.Length == 0))
    {
        throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
    }
    try
    {
        mutex.WaitOne();
        string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
        string pdfFileName = FileName + ".pdf";
        string txtFileName = FileName + ".txt";
        FileStream pdf = null;
        StreamReader txt = null;
        try
        {
            //store the pdf file
            pdf = new FileStream(pdfFileName, FileMode.Create);
            pdf.Write(content, 0, content.Length);
            pdf.Close();
            pdf = null;
            bool success = false;
            //convert it to text
            try
            {
                converter.loadFile(pdfFileName);
                converter.convertToTextFile(1, converter.numPages, txtFileName);
                success = true;
            }
            catch
            {
                success = false;
            }
            finally
            {
                converter.closeFile();
            }
            if (success)
            {
                txt = new StreamReader(txtFileName, encoding);
                string text = txt.ReadToEnd();
                txt.Close();
                txt = null;
                links = parser.ExtractLinks(ref text, ref contentUrl);
            }
            else
            {
                txt = null;
            }
        }
        catch (Exception ex)
        {
            if (pdf != null)
            {
                try { pdf.Close(); }
                catch {}
            }
            if (txt != null)
            {
                try { txt.Close(); }
                catch {}
            }
            if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
            {
                globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
            }
        }
        finally
        {
            File.Delete(pdfFileName);
            File.Delete(txtFileName);
        }
    }
    catch
    {
        if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
        {
            globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
        }
    }
    finally
    {
        GC.Collect();
        mutex.ReleaseMutex();
    }
    ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
    OnExtractLinksComplete(e);
    return links;
}