예제 #1
0
 /// <summary>
 /// Notifies the subscribers of <c>ExtractContentComplete</c>, if there are any, that
 /// an extraction of content has finished.
 /// </summary>
 /// <param name="e">The <see cref="ParserEventArgs"/> handed to the subscribers.</param>
 private void OnExtractContentComplete(ParserEventArgs e)
 {
     // Nobody is listening: nothing to notify.
     if (ExtractContentComplete == null)
     {
         return;
     }

     ExtractContentComplete(this, e);
 }
예제 #2
0
        /// <summary>
        /// Link extraction for a document supplied as raw bytes. This implementation
        /// does not inspect the document at all.
        /// </summary>
        /// <param name="content">The contents of the document. Not examined here.</param>
        /// <param name="contentUrl">
        /// The url of the document; its <c>Url</c> value is placed in the
        /// <see cref="ParserEventArgs"/> passed to <c>OnExtractLinksComplete</c>.
        /// </param>
        /// <returns>A newly created <see cref="ArrayList"/> that holds no items.</returns>
        /// <remarks>
        /// The result is <b>ALWAYS</b> empty. <c>OnExtractLinksComplete</c> is called
        /// before the method returns.
        /// </remarks>
        public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
        {
            // There is nothing to extract, so completion is reported right away.
            OnExtractLinksComplete(new ParserEventArgs(contentUrl.Url));

            return new ArrayList();
        }
예제 #3
0
        /// <summary>
        /// Content extraction for a document supplied as raw bytes. This implementation
        /// does not inspect the document at all.
        /// </summary>
        /// <param name="content">The contents of the document. Not examined here.</param>
        /// <param name="Flag">Ignored by this implementation.</param>
        /// <returns><see cref="String.Empty"/>, in every case.</returns>
        /// <remarks>
        /// <c>OnExtractContentComplete</c> is called, with event arguments built from an
        /// empty string, before the method returns.
        /// </remarks>
        public override string ExtractContent(byte[] content, bool Flag)
        {
            ParserEventArgs completion = new ParserEventArgs(String.Empty);
            OnExtractContentComplete(completion);

            return String.Empty;
        }
예제 #4
0
        /// <summary>
        /// Text extraction for a document supplied as a string. This implementation
        /// does not inspect the document at all.
        /// </summary>
        /// <param name="content">The contents of the document. Neither read nor modified here.</param>
        /// <returns><see cref="String.Empty"/>, in every case.</returns>
        /// <remarks>
        /// <c>OnExtractTextComplete</c> is called, with event arguments built from an
        /// empty string, before the method returns.
        /// </remarks>
        public override string ExtractText(ref string content)
        {
            // Nothing is extracted; only the completion notification is issued.
            OnExtractTextComplete(new ParserEventArgs(String.Empty));

            return String.Empty;
        }
예제 #5
0
        /// <summary>
        /// Produces the 'clean' text of a text document: line breaks and tabs are turned
        /// into spaces and the result is then passed through <c>spacesRegex</c>, every
        /// match of which is replaced by a single space.
        /// </summary>
        /// <param name="content">
        /// The document text to clean. It is passed by reference (the original author's
        /// note: in order to reduce memory consumption) and is not modified here.
        /// </param>
        /// <returns>The cleaned text.</returns>
        /// <remarks>
        /// <c>OnExtractTextComplete</c> is called, with event arguments built from an
        /// empty string, before the method returns.
        /// </remarks>
        public override string ExtractText(ref string content)
        {
            // Step 1: "\r\n" pairs first, then any remaining '\n' and '\t', all become ' '.
            StringBuilder buffer = new StringBuilder(content);
            buffer.Replace("\r\n", " ").Replace('\n', ' ').Replace('\t', ' ');

            // Step 2: whatever spacesRegex matches collapses to one space.
            string compacted = spacesRegex.Replace(buffer.ToString(), " ");

            OnExtractTextComplete(new ParserEventArgs(String.Empty));
            return compacted;
        }
예제 #6
0
 /// <summary>
 /// Event handler that writes the url carried by the event arguments to the log
 /// through <c>log.LogInfo</c>.
 /// </summary>
 /// <param name="sender">The object that raised the event. Not used.</param>
 /// <param name="e">The <see cref="ParserEventArgs"/> whose <c>Url</c> is logged.</param>
 private void crawler_UrlProcessed(object sender, ParserEventArgs e)
 {
     string message = "Crawler processed url " + e.Url;

     log.LogInfo(message);
 }
예제 #7
0
 /// <summary>
 /// Content extraction for a document supplied as a string. This implementation
 /// does not inspect the document at all.
 /// </summary>
 /// <param name="content">The contents of the document. Neither read nor modified here.</param>
 /// <param name="Flag">Ignored by this implementation.</param>
 /// <returns><see cref="String.Empty"/>, in every case.</returns>
 /// <remarks>
 /// <c>OnExtractContentComplete</c> is called, with event arguments built from an
 /// empty string, before the method returns.
 /// </remarks>
 public override string ExtractContent(ref string content, bool Flag)
 {
     // Nothing is extracted; only the completion notification is issued.
     OnExtractContentComplete(new ParserEventArgs(String.Empty));

     return String.Empty;
 }
예제 #8
0
 /// <summary>
 /// Text extraction for a document supplied as raw bytes. This implementation
 /// does not inspect the document at all.
 /// </summary>
 /// <param name="content">The contents of the document. Not examined here.</param>
 /// <returns><see cref="String.Empty"/>, in every case.</returns>
 /// <remarks>
 /// <c>OnExtractTextComplete</c> is called, with event arguments built from an
 /// empty string, before the method returns.
 /// </remarks>
 public override string ExtractText(byte[] content)
 {
     ParserEventArgs completion = new ParserEventArgs(String.Empty);
     OnExtractTextComplete(completion);

     return String.Empty;
 }
예제 #9
0
 /// <summary>
 /// Link extraction for a document supplied as raw bytes. This implementation
 /// does not inspect the document at all.
 /// </summary>
 /// <param name="content">The contents of the document. Not examined here.</param>
 /// <param name="contentUrl">
 /// The url of the document; its <c>Url</c> value is placed in the
 /// <see cref="ParserEventArgs"/> passed to <c>OnExtractLinksComplete</c>.
 /// </param>
 /// <returns>A newly created <see cref="ArrayList"/> that holds no items.</returns>
 /// <remarks>
 /// The result is <b>ALWAYS</b> empty. <c>OnExtractLinksComplete</c> is called
 /// before the method returns.
 /// </remarks>
 public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
 {
     ParserEventArgs completion = new ParserEventArgs(contentUrl.Url);
     OnExtractLinksComplete(completion);

     return new ArrayList();
 }
예제 #10
0
 /// <summary>
 /// Extracts text from the contents of a SWF document.
 /// </summary>
 /// <param name="content">The contents of the SWF document.</param>
 /// <returns>
 /// The text that <c>parser.ExtractText</c> produces from the converted document, or
 /// an empty string when the conversion reports failure or an error is caught.
 /// </returns>
 /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
 /// <remarks>
 /// The buffer is written to a uniquely named temporary .swf file (the name is
 /// <c>globals.AppWorkPath</c> followed by a new GUID), <c>converter</c> turns it into
 /// a temporary .htm file, and both files are deleted afterwards. The work is done
 /// between <c>mutex.WaitOne()</c> and <c>mutex.ReleaseMutex()</c>.
 /// <c>OnExtractTextComplete</c> is called before returning, whether or not any text
 /// was produced.
 /// </remarks>
 public override string ExtractText(byte[] content)
 {
     if ((content == null) || (content.Length == 0))
     {
         throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
     }
     string retVal = String.Empty;
     try
     {
         // NOTE(review): WaitOne() sits inside the try, so the finally below calls
         // ReleaseMutex() even if WaitOne() itself failed - confirm that is acceptable.
         mutex.WaitOne();
         string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
         string swfFileName = FileName + ".swf";
         string htmFileName = FileName + ".htm";
         try
         {
             //store the swf file; 'using' closes the stream on success and on error
             using (FileStream swf = new FileStream(swfFileName, FileMode.Create))
             {
                 swf.Write(content, 0, content.Length);
             }
             //convert it to html
             bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
             if (success)
             {
                 string html;
                 using (StreamReader htm = new StreamReader(htmFileName, encoding))
                 {
                     html = htm.ReadToEnd();
                 }
                 retVal = parser.ExtractText(ref html);
             }
         }
         catch (Exception ex)
         {
             if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
             {
                 globals.FileLog.LogWarning("SwfParser failed to extract text: " + ex.ToString());
             }
         }
         finally
         {
             // The temporary files are removed no matter how the conversion went.
             File.Delete(swfFileName);
             File.Delete(htmFileName);
         }
     }
     catch (Exception ex)
     {
         if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("SwfParser failed to extract text: " + ex.Message);
         }
     }
     finally
     {
         // NOTE(review): the forced collection is kept from the original code.
         GC.Collect();
         mutex.ReleaseMutex();
     }
     ParserEventArgs e = new ParserEventArgs(String.Empty);
     OnExtractTextComplete(e);
     return retVal;
 }
예제 #11
0
 /// <summary>
 /// Produces the 'clean' text of a text document: line breaks and tabs are turned
 /// into spaces and the result is then passed through <c>spacesRegex</c>, every
 /// match of which is replaced by a single space.
 /// </summary>
 /// <param name="content">
 /// The document text to clean. It is passed by reference (the original author's
 /// note: in order to reduce memory consumption) and is not modified here.
 /// </param>
 /// <returns>The cleaned text.</returns>
 /// <remarks>
 /// <c>OnExtractTextComplete</c> is called, with event arguments built from an
 /// empty string, before the method returns.
 /// </remarks>
 public override string ExtractText(ref string content)
 {
     // "\r\n" pairs are handled first, then any remaining '\n' and '\t'.
     string flattened = new StringBuilder(content)
         .Replace("\r\n", " ")
         .Replace('\n', ' ')
         .Replace('\t', ' ')
         .ToString();

     // Whatever spacesRegex matches collapses to one space.
     string cleanText = spacesRegex.Replace(flattened, " ");

     OnExtractTextComplete(new ParserEventArgs(String.Empty));
     return cleanText;
 }
예제 #12
0
        /// <summary>
        /// Scans a text document for links and returns them as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects. The candidate links are the
        /// matches of <c>hrefRegex</c> in the text.
        /// </summary>
        /// <remarks>
        /// Each candidate goes through the following steps, in this order:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>It is trimmed and normalized against the base url of the
        ///     document (<c>NormalizeUrl</c>).</description>
        ///   </item>
        ///   <item>
        ///     <description>Its parameters are cleaned by <c>CleanUrlParams</c>, whose
        ///     return value becomes the priority of the link.</description>
        ///   </item>
        ///   <item>
        ///     <description>It is discarded when <c>FilterUrl</c> returns false.</description>
        ///   </item>
        ///   <item>
        ///     <description>It is flagged by <c>domainFilter</c> and, only when that flag is
        ///     <c>MustVisit</c>, by <c>robotsFilter</c>; otherwise its robots flag is false.</description>
        ///   </item>
        ///   <item>
        ///     <description>It is added to the result unless the result already contains it.</description>
        ///   </item>
        /// </list>
        /// Before the matching starts, a document whose domain flag is not <c>MustVisit</c>
        /// has that flag recomputed from its text by <c>ExtractDomainFlag</c>, and is forced
        /// to <c>MustVisit</c> when its host name contains "ebay.com".<br/>
        /// Exceptions thrown inside this processing are caught and logged (subject to the
        /// configured log level); a failure on one candidate only skips that candidate.
        /// <c>OnExtractLinksComplete</c> is called before the method returns.<br/>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///		<term>Date</term>
        ///		<description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed by
        /// reference (the original author's note: in order to reduce memory consumption).</param>
        /// <param name="contentUrl">The Url from which the content comes. Its
        /// <c>FlagDomain</c> may be updated by this method.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each distinct link that passed the filters; empty when nothing was found.
        /// </returns>
        public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
        {
            // NOTE: earlier revisions serialized this method with a mutex and saved and
            // restored contentUrl.FlagFetchRobots around the robots filtering (the original
            // comment says the filtering sets it to false so that robots.txt is not
            // downloaded for every url). Both pieces of code are disabled.
            ArrayList foundLinks = new ArrayList();

            try
            {
                // Give a document that is not yet MustVisit a chance to be promoted.
                if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                {
                    contentUrl.FlagDomain = ExtractDomainFlag(ref content);

                    if ((contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                        && InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                    {
                        contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                    }
                }

                //perform the hyperlink matching
                MatchCollection candidates = hrefRegex.Matches(content);

                if (candidates.Count > 0)
                {
                    string documentUrl = contentUrl.Url;
                    string baseUrl     = BaseUrl(ref documentUrl);

                    foreach (Match candidate in candidates)
                    {
                        try
                        {
                            string url = candidate.Value.Trim();
                            url = NormalizeUrl(ref url, ref baseUrl);
                            byte priority = CleanUrlParams(ref url);

                            // Rejected urls need no further work.
                            if (!FilterUrl(ref url, ref documentUrl))
                            {
                                continue;
                            }

                            InternetUrlToIndex link = new InternetUrlToIndex(url);
                            link.Priority   = priority;
                            link.FlagDomain = domainFilter.FilterUrl(ref url);
                            //[mod 24/2/05] Only MustVisit urls go through the robots filter
                            //(original note: no robots.txt checking for non-greek urls)
                            if (link.FlagDomain != DomainFlagValue.MustVisit)
                            {
                                link.FlagRobots = false;
                            }
                            else
                            {
                                link.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                            }

                            // Keep only the first occurrence of each link.
                            if (!foundLinks.Contains(link))
                            {
                                foundLinks.Add(link);
                            }
                        }
                        catch
                        {
                            // A bad candidate is reported and skipped; the loop goes on.
                            if (globals.Settings.LogLevel == CWLogLevel.LogInfo)
                            {
                                globals.FileLog.LogInfo("TextParser failed to parse " + candidate.Value);
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning(ex.Message);
                }
            }

            OnExtractLinksComplete(new ParserEventArgs(contentUrl.Url));
            foundLinks.TrimToSize();
            return foundLinks;
        }
예제 #13
0
 /// <summary>
 /// Notifies the subscribers of <see cref="UrlProcessed"/>, if there are any.
 /// </summary>
 /// <param name="e">The <see cref="ParserEventArgs"/> handed to the subscribers.</param>
 private void OnUrlProcessed(ParserEventArgs e)
 {
     // Nobody is listening: nothing to notify.
     if (UrlProcessed == null)
     {
         return;
     }

     UrlProcessed(this, e);
 }
예제 #14
0
 /// <summary>
 /// Extracts text from the contents of a PDF document.
 /// </summary>
 /// <param name="content">The contents of the PDF document.</param>
 /// <returns>
 /// The text that <c>parser.ExtractText</c> produces from the converted document, or
 /// an empty string when an error is caught along the way.
 /// </returns>
 /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
 /// <remarks>
 /// The buffer is written to a uniquely named temporary .pdf file (the name is
 /// <c>globals.AppWorkPath</c> followed by a new GUID), <c>converter</c> turns it into
 /// a temporary .txt file, and both files are deleted afterwards. The work is done
 /// between <c>mutex.WaitOne()</c> and <c>mutex.ReleaseMutex()</c>.
 /// A failure of the converter is reported through the same log message as any other
 /// extraction error instead of being discarded silently.
 /// <c>OnExtractTextComplete</c> is called before returning, whether or not any text
 /// was produced.
 /// </remarks>
 public override string ExtractText(byte[] content)
 {
     if ((content == null) || (content.Length == 0))
     {
         throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
     }
     string retVal = String.Empty;
     try
     {
         // NOTE(review): WaitOne() sits inside the try, so the finally below calls
         // ReleaseMutex() even if WaitOne() itself failed - confirm that is acceptable.
         mutex.WaitOne();
         string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
         string pdfFileName = FileName + ".pdf";
         string txtFileName = FileName + ".txt";
         try
         {
             //store the pdf file; 'using' closes the stream on success and on error
             using (FileStream pdf = new FileStream(pdfFileName, FileMode.Create))
             {
                 pdf.Write(content, 0, content.Length);
             }
             //convert it to text; an exception here falls through to the catch below,
             //which logs it, while closeFile() still runs
             try
             {
                 converter.loadFile(pdfFileName);
                 converter.convertToTextFile(1, converter.numPages, txtFileName);
             }
             finally
             {
                 converter.closeFile();
             }
             string text;
             using (StreamReader txt = new StreamReader(txtFileName, encoding))
             {
                 text = txt.ReadToEnd();
             }
             retVal = parser.ExtractText(ref text);
         }
         catch (Exception ex)
         {
             if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
             {
                 globals.FileLog.LogWarning("PdfParser failed to extract text: " + ex.ToString());
             }
         }
         finally
         {
             // The temporary files are removed no matter how the conversion went.
             File.Delete(pdfFileName);
             File.Delete(txtFileName);
         }
     }
     catch (Exception ex)
     {
         if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("PdfParser failed to extract text: " + ex.Message);
         }
     }
     finally
     {
         // NOTE(review): the forced collection is kept from the original code.
         GC.Collect();
         mutex.ReleaseMutex();
     }
     ParserEventArgs e = new ParserEventArgs(String.Empty);
     OnExtractTextComplete(e);
     return retVal;
 }
예제 #15
0
        /// <summary>
        /// Performs the extraction of content from a SWF document. Depending on the value
        /// of the Flag provided it returns either the HTML that the converter produced
        /// from the input or the result of passing that HTML through
        /// <c>parser.ExtractText</c>.
        /// </summary>
        /// <param name="content">
        /// The contents of the document from which the content must be extracted.
        /// </param>
        /// <param name="Flag">Determines what kind of processing will be performed on the
        /// input. If set to false the HTML produced by the converter is returned as is.
        /// If set to true the HTML is first passed through <c>parser.ExtractText</c>.
        /// </param>
        /// <returns>
        /// A string containing the desired extracted content, or an empty string when the
        /// conversion reports failure or an error is caught. The method does not return
        /// null.
        /// </returns>
        /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
        /// <remarks>
        /// The buffer is written to a uniquely named temporary .swf file (the name is
        /// <c>globals.AppWorkPath</c> followed by a new GUID), <c>converter</c> turns it
        /// into a temporary .htm file, and both files are deleted afterwards. The work is
        /// done between <c>mutex.WaitOne()</c> and <c>mutex.ReleaseMutex()</c>.
        /// <c>OnExtractContentComplete</c> is called before returning, whether or not any
        /// content was produced.
        /// </remarks>
        public override string ExtractContent(byte[] content, bool Flag)
        {
            if ((content == null) || (content.Length == 0))
            {
                throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
            }

            // Start from an empty string so that a failed extraction yields "" rather
            // than null.
            string retVal = String.Empty;

            try
            {
                // NOTE(review): WaitOne() sits inside the try, so the finally below calls
                // ReleaseMutex() even if WaitOne() itself failed - confirm that is acceptable.
                mutex.WaitOne();
                string FileName    = globals.AppWorkPath + Guid.NewGuid().ToString();
                string swfFileName = FileName + ".swf";
                string htmFileName = FileName + ".htm";
                try
                {
                    //store the swf file; 'using' closes the stream on success and on error
                    using (FileStream swf = new FileStream(swfFileName, FileMode.Create))
                    {
                        swf.Write(content, 0, content.Length);
                    }
                    //convert it to html
                    bool success = converter.ConvertSwfFile(swfFileName, htmFileName);
                    if (success)
                    {
                        string html;
                        using (StreamReader htm = new StreamReader(htmFileName, encoding))
                        {
                            html = htm.ReadToEnd();
                        }
                        if (!Flag)
                        {
                            retVal = html;
                        }
                        else
                        {
                            retVal = parser.ExtractText(ref html);
                        }
                    }
                }
                catch (Exception ex)
                {
                    if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
                    {
                        globals.FileLog.LogWarning("SwfParser failed to extract content: " + ex.ToString());
                    }
                }
                finally
                {
                    // The temporary files are removed no matter how the conversion went.
                    File.Delete(swfFileName);
                    File.Delete(htmFileName);
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("SwfParser failed to extract content: " + ex.Message);
                }
            }
            finally
            {
                // NOTE(review): the forced collection is kept from the original code.
                GC.Collect();
                mutex.ReleaseMutex();
            }
            ParserEventArgs e = new ParserEventArgs(String.Empty);

            OnExtractContentComplete(e);
            return(retVal);
        }
예제 #16
0
        /// <summary>
        /// Performs the extraction of content from a PDF document. Depending on the value
        /// of the Flag provided it returns either the text that the converter produced
        /// from the PDF document or the result of passing that text through
        /// <c>parser.ExtractText</c>.
        /// </summary>
        /// <param name="content">
        /// The contents of the document from which the content must be extracted.
        /// </param>
        /// <param name="Flag">Determines what kind of processing will be performed on the
        /// input. If set to false the text produced by the converter is returned as is.
        /// If set to true the text is first passed through <c>parser.ExtractText</c>.
        /// </param>
        /// <returns>
        /// A string containing the desired extracted content, or an empty string when the
        /// converter returns a non-zero result or an error is caught.
        /// </returns>
        /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
        /// <remarks>
        /// The buffer is written to a uniquely named temporary .pdf file (the name is
        /// <c>globals.AppWorkPath</c> followed by a new GUID), <c>converter</c> turns it
        /// into a temporary .txt file, and both files are deleted afterwards. The work is
        /// done between <c>mutex.WaitOne()</c> and <c>mutex.ReleaseMutex()</c>.
        /// <c>OnExtractContentComplete</c> is called before returning, whether or not any
        /// content was produced.
        /// </remarks>
        public override string ExtractContent(byte[] content, bool Flag)
        {
            if ((content == null) || (content.Length == 0))
            {
                throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
            }

            string retVal = String.Empty;

            try
            {
                // NOTE(review): WaitOne() sits inside the try, so the finally below calls
                // ReleaseMutex() even if WaitOne() itself failed - confirm that is acceptable.
                mutex.WaitOne();
                string FileName    = globals.AppWorkPath + Guid.NewGuid().ToString();
                string pdfFileName = FileName + ".pdf";
                string txtFileName = FileName + ".txt";
                try
                {
                    //store the pdf file; 'using' closes the stream on success and on error
                    using (FileStream pdf = new FileStream(pdfFileName, FileMode.Create))
                    {
                        pdf.Write(content, 0, content.Length);
                    }
                    //convert it to text; the code treats a result of 0 as success
                    int success = converter.ConvertPdf2Text(pdfFileName, txtFileName);
                    if (success == 0)
                    {
                        // Read the converted text once, then decide what to do with it.
                        string text;
                        using (StreamReader txt = new StreamReader(txtFileName, encoding))
                        {
                            text = txt.ReadToEnd();
                        }
                        if (!Flag)
                        {
                            retVal = text;
                        }
                        else
                        {
                            retVal = parser.ExtractText(ref text);
                        }
                    }
                }
                catch (Exception ex)
                {
                    if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
                    {
                        globals.FileLog.LogWarning("PdfParser failed to extract content: " + ex.ToString());
                    }
                }
                finally
                {
                    // The temporary files are removed no matter how the conversion went.
                    File.Delete(pdfFileName);
                    File.Delete(txtFileName);
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("PdfParser failed to extract content: " + ex.Message);
                }
            }
            finally
            {
                // NOTE(review): the forced collection is kept from the original code.
                GC.Collect();
                mutex.ReleaseMutex();
            }
            ParserEventArgs e = new ParserEventArgs(String.Empty);

            OnExtractContentComplete(e);
            return(retVal);
        }
예제 #17
0
 /// <summary>
 /// Extracts links from the contents of a PDF document.
 /// </summary>
 /// <param name="content">The contents of the PDF document.</param>
 /// <param name="contentUrl">The url of the PDF document.</param>
 /// <returns>
 /// The result of <c>parser.ExtractLinks</c> on the converted text when the conversion
 /// succeeds. When the converter returns a non-zero result or an error is caught, an
 /// empty <see cref="ArrayList"/> is returned instead of null.
 /// </returns>
 /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
 /// <remarks>
 /// The buffer is written to a uniquely named temporary .pdf file (the name is
 /// <c>globals.AppWorkPath</c> followed by a new GUID), <c>converter</c> turns it into
 /// a temporary .txt file, and both files are deleted afterwards. The work is done
 /// between <c>mutex.WaitOne()</c> and <c>mutex.ReleaseMutex()</c>.
 /// <c>OnExtractLinksComplete</c> is called before returning, whether or not any links
 /// were produced.
 /// </remarks>
 public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
 {
     if ((content == null) || (content.Length == 0))
     {
         throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
     }
     // Start from an empty list so that a failed extraction never hands null to callers.
     ArrayList links = new ArrayList();
     try
     {
         // NOTE(review): WaitOne() sits inside the try, so the finally below calls
         // ReleaseMutex() even if WaitOne() itself failed - confirm that is acceptable.
         mutex.WaitOne();
         string FileName = globals.AppWorkPath + Guid.NewGuid().ToString();
         string pdfFileName = FileName + ".pdf";
         string txtFileName = FileName + ".txt";
         try
         {
             //store the pdf file; 'using' closes the stream on success and on error
             using (FileStream pdf = new FileStream(pdfFileName, FileMode.Create))
             {
                 pdf.Write(content, 0, content.Length);
             }
             //convert it to text; the code treats a result of 0 as success
             int success = converter.ConvertPdf2Text(pdfFileName, txtFileName);
             if (success == 0)
             {
                 string text;
                 using (StreamReader txt = new StreamReader(txtFileName, encoding))
                 {
                     text = txt.ReadToEnd();
                 }
                 links = parser.ExtractLinks(ref text, ref contentUrl);
             }
             else if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
             {
                 // No reader was opened on this path, so there is nothing to close.
                 // The failure is reported explicitly; the old code only surfaced it
                 // through a NullReferenceException raised by closing a null reader.
                 globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": the pdf to text conversion returned " + success.ToString());
             }
         }
         catch (Exception ex)
         {
             if (globals.Settings.LogLevel <= CWLogLevel.LogInfo)
             {
                 globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
             }
         }
         finally
         {
             // The temporary files are removed no matter how the conversion went.
             File.Delete(pdfFileName);
             File.Delete(txtFileName);
         }
     }
     catch
     {
         if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
         {
             globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
         }
     }
     finally
     {
         // NOTE(review): the forced collection is kept from the original code.
         GC.Collect();
         mutex.ReleaseMutex();
     }
     ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
     OnExtractLinksComplete(e);
     return links;
 }
예제 #18
0
 /// <summary>
 /// Notifies the subscribers of <c>ExtractTextComplete</c>, if there are any, that
 /// an extraction of text has finished.
 /// </summary>
 /// <param name="e">The <see cref="ParserEventArgs"/> handed to the subscribers.</param>
 private void OnExtractTextComplete(ParserEventArgs e)
 {
     // Nobody is listening: nothing to notify.
     if (ExtractTextComplete == null)
     {
         return;
     }

     ExtractTextComplete(this, e);
 }
예제 #19
0
        /// <summary>
        /// Performs the extraction of links from a text document. It can extract simple
        /// links that are separated from the rest of the text using spaces or line breaks
        /// or any other delimiters. The results are returned as an <see cref="ArrayList"/>
        /// of <see cref="InternetUrlToIndex"/> objects.
        /// </summary>
        /// <remarks>
        /// Besides the parsing and extraction of Urls, ExtractLinks also performs other
        /// tasks as well, such as:<br/>
        /// <list type="bullet">
        ///   <item>
        ///     <description>Filtering of urls to resources of unsupported content-type, e.g. css, images, etc.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of multiple links to the same url and to the document itself.</description>
        ///   </item>
        ///   <item>
        ///     <description>Filtering of session id variables in dynamic Urls and limiting
        ///     of the number of GET variables in dynamic Urls.</description>
        ///   </item>
        ///   <item>
        ///     <description>Flagging of Urls according to their country domain.</description>
        ///   </item>
        /// </list>
        /// Failures are never propagated to the caller: a link that cannot be processed
        /// is logged and skipped, and any other exception is logged as a warning. In
        /// every case the links collected so far are returned and the
        /// ExtractLinksComplete event is raised.
        /// <br/>
        /// <b>Update History</b>
        /// <list type="table">
        ///   <listheader>
        ///		<term>Date</term>
        ///		<description>Description</description>
        ///   </listheader>
        ///   <item>
        ///     <term>15/09/04</term>
        ///     <description>First release. A lot more needs to be done.</description>
        ///   </item>
        /// </list>
        /// </remarks>
        /// <param name="content">The text that must be parsed for links. It is passed by
        /// reference in order to reduce memory consumption.</param>
        /// <param name="contentUrl">The Url from which the content comes. Its FlagDomain
        /// may be updated by this method when it is not already MustVisit.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content. The list is empty (never null) when no link
        /// is found or when parsing fails.
        /// </returns>
        public override ArrayList ExtractLinks(ref string content, ref InternetUrlToCrawl contentUrl)
        {
            ArrayList links = new ArrayList();

            // NOTE(review): earlier revisions serialized this method with a mutex and
            // saved/restored contentUrl.FlagFetchRobots around the robots filtering.
            // Both were disabled, so no lock is taken here and any change the robots
            // filter makes to that flag stays visible to the caller - confirm intended.
            try
            {
                // Only documents not already marked MustVisit have their domain flag
                // re-evaluated from the content.
                if (contentUrl.FlagDomain != DomainFlagValue.MustVisit)
                {
                    contentUrl.FlagDomain = ExtractDomainFlag(ref content);

                    // Hosts containing "ebay.com" are always promoted to MustVisit.
                    if (contentUrl.FlagDomain != DomainFlagValue.MustVisit
                        && InternetUtils.HostName(contentUrl).Contains("ebay.com"))
                    {
                        contentUrl.FlagDomain = DomainFlagValue.MustVisit;
                    }
                }

                //perform the hyperlink matching
                MatchCollection matches = hrefRegex.Matches(content);

                if (matches.Count > 0)
                {
                    string documentUrl = contentUrl.Url;
                    string baseUrl = BaseUrl(ref documentUrl);

                    foreach (Match m in matches)
                    {
                        try
                        {
                            string url = m.Value.Trim();
                            url = NormalizeUrl(ref url, ref baseUrl);
                            byte priority = CleanUrlParams(ref url);

                            if (FilterUrl(ref url, ref documentUrl))
                            {
                                InternetUrlToIndex iurl = new InternetUrlToIndex(url);
                                iurl.Priority = priority;
                                iurl.FlagDomain = domainFilter.FilterUrl(ref url);

                                //[mod 24/2/05] No robots.txt checking is performed for non-greek urls
                                if (iurl.FlagDomain == DomainFlagValue.MustVisit)
                                {
                                    iurl.FlagRobots = robotsFilter.FilterUrl(url, contentUrl, RobotsMetaTagValue.NoMeta);
                                }
                                else
                                {
                                    iurl.FlagRobots = false;
                                }

                                // Skip links already collected from this document.
                                if (!links.Contains(iurl))
                                {
                                    links.Add(iurl);
                                }
                            }
                        }
                        catch (Exception linkEx)
                        {
                            // One malformed link must not abort the whole document:
                            // record why it failed and move on to the next match.
                            if (globals.Settings.LogLevel == CWLogLevel.LogInfo)
                            {
                                globals.FileLog.LogInfo("TextParser failed to parse " + m.Value + ": " + linkEx.Message);
                            }
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning(ex.Message);
                }
            }

            // The completion event is raised even when parsing failed part-way.
            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);
            OnExtractLinksComplete(e);
            links.TrimToSize();
            return links;
        }
예제 #20
0
 private void crawler_UrlProcessed(object sender, ParserEventArgs e)
 {
     // Build the log entry for the url carried by the event arguments,
     // then hand it to the logger at info level.
     string message = "Crawler processed url " + e.Url;

     log.LogInfo(message);
 }
예제 #21
0
        /// <summary>
        /// Extracts links from the contents of a PDF document.
        /// </summary>
        /// <remarks>
        /// The buffer is written to a temporary .pdf file, converted to a temporary
        /// .txt file by the converter, and the resulting text is handed to the text
        /// parser. Both temporary files are deleted afterwards. The whole operation
        /// runs between <c>mutex.WaitOne()</c> and <c>mutex.ReleaseMutex()</c>.
        /// Conversion and parsing failures are logged and not propagated; the
        /// ExtractLinksComplete event is raised in every case.
        /// </remarks>
        /// <param name="content">The contents of the PDF document.</param>
        /// <param name="contentUrl">The url of the PDF document.</param>
        /// <returns>
        /// An <see cref="ArrayList"/> of <see cref="InternetUrlToIndex"/> objects, one for
        /// each link found in the content. The list is empty (never null) when the
        /// document cannot be converted or parsed.
        /// </returns>
        /// <exception cref="ArgumentNullException">If the input buffer is null or empty.</exception>
        public override ArrayList ExtractLinks(byte[] content, ref InternetUrlToCrawl contentUrl)
        {
            if ((content == null) || (content.Length == 0))
            {
                throw new ArgumentNullException("content", "The input buffer cannot be empty or null.");
            }

            // Start with an empty list so that a failed conversion means "no links"
            // instead of handing a null reference back to the caller.
            ArrayList links = new ArrayList();

            try
            {
                mutex.WaitOne();

                // A fresh GUID gives each call its own pair of temporary files.
                string fileName    = globals.AppWorkPath + Guid.NewGuid().ToString();
                string pdfFileName = fileName + ".pdf";
                string txtFileName = fileName + ".txt";

                try
                {
                    //store the pdf file; the using block closes it even if Write fails
                    using (FileStream pdf = new FileStream(pdfFileName, FileMode.Create))
                    {
                        pdf.Write(content, 0, content.Length);
                    }

                    //convert it to text
                    bool success = false;
                    try
                    {
                        converter.loadFile(pdfFileName);
                        converter.convertToTextFile(1, converter.numPages, txtFileName);
                        success = true;
                    }
                    catch
                    {
                        // Best effort: a document the converter cannot handle simply
                        // produces no links.
                        success = false;
                    }
                    finally
                    {
                        converter.closeFile();
                    }

                    if (success)
                    {
                        string text;
                        using (StreamReader txt = new StreamReader(txtFileName, encoding))
                        {
                            text = txt.ReadToEnd();
                        }

                        ArrayList extracted = parser.ExtractLinks(ref text, ref contentUrl);
                        if (extracted != null)
                        {
                            links = extracted;
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Same threshold as every other warning in the parsers, so the
                    // failure is not silently dropped at warning level.
                    if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                    {
                        globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url + ": " + ex.ToString());
                    }
                }
                finally
                {
                    // Always remove the temporary files, whether or not they were used.
                    File.Delete(pdfFileName);
                    File.Delete(txtFileName);
                }
            }
            catch
            {
                if (globals.Settings.LogLevel <= CWLogLevel.LogWarning)
                {
                    globals.FileLog.LogWarning("PdfParser failed to extract links from " + contentUrl.Url);
                }
            }
            finally
            {
                // NOTE(review): forced collection kept from the original, presumably to
                // reclaim converter resources promptly - confirm it is still needed.
                GC.Collect();
                mutex.ReleaseMutex();
            }

            ParserEventArgs e = new ParserEventArgs(contentUrl.Url);

            OnExtractLinksComplete(e);
            return links;
        }