Example #1
0
        //The Content Type supported by the parser

        #endregion

        #region Constructor and Singleton Instance members

        /// <summary>
        /// The constructor is private so that only the class itself can create an instance.
        /// </summary>
        private TextParser()
        {
            //Initialize the synchronization mechanism
            mutex = new Mutex();
            //Initialize the Regular Expressions
            hrefRegex = new Regex(@"(http|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?", RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
            //use "(http|ftp|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" to enable ftp urls
            sessionIDRegex = new Regex(@"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$", RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);       //@"^([0-9a-f]{32})|(\{?[0-9a-f]{8}-([0-9a-f]{4}-){3}-[0-9a-f]{12}\}?)$"
            spacesRegex    = new Regex(@"\s+", RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);
            //Initialize the filters
            robotsFilter = RobotsFilter.Instance();
            domainFilter = DomainFilter.Instance();
            //Get a reference to the global variables and application settings
            globals = Globals.Instance();
        }
Example #2
0
 /// <summary>
 /// Provides a global access point for the single instance of the <see cref="RobotsFilter"/>
 /// class.
 /// </summary>
 /// <returns>A reference to the single instance of <see cref="RobotsFilter"/>.</returns>
 public static RobotsFilter Instance()
 {
     if (instance==null)
     {
         //Make sure the call is thread-safe. We cannot use the private mutex since
         //it hasn't yet been initialized - it gets initialized in the constructor.
         Mutex imutex=new Mutex();
         imutex.WaitOne();
         if( instance == null )
         {
             instance = new RobotsFilter();
         }
         imutex.Close();
     }
     return instance;
 }
Example #3
0
 /// <summary>
 /// Provides a global access point for the single instance of the <see cref="RobotsFilter"/>
 /// class.
 /// </summary>
 /// <returns>A reference to the single instance of <see cref="RobotsFilter"/>.</returns>
 public static RobotsFilter Instance()
 {
     if (instance == null)
     {
         //Make sure the call is thread-safe. We cannot use the private mutex since
         //it hasn't yet been initialized - it gets initialized in the constructor.
         Mutex imutex = new Mutex();
         imutex.WaitOne();
         if (instance == null)
         {
             instance = new RobotsFilter();
         }
         imutex.Close();
     }
     return(instance);
 }
Example #4
0
 /// <summary>
 /// Constructs a new istance of the <see cref="Crawler"/> class and initializes its
 /// properties with the default values. There should be only one instance of Crawler
 /// </summary>
 public Crawler()
 {
     //first of all get a reference to the global variables because they are needed
     //in order to initialize some variables.
     globals = Globals.Instance();
     mustStop = false;
     stopping = false;
     state = CrawlerState.Stopped;
     stats = new long[10] {0,0,0,0,0,0,0,0,0,0};
     numThreads = (int)globals.Settings.ConnectionSpeed;
     runningThreads = 0;
     sendResultsThread = null;
     synchronizeThread = null;
     crawlingThreads = null;
     syncBackOff = new ExponentialBackoff(BackoffSpeed.Declining);
     downloadBackOff = new ExponentialBackoff(BackoffSpeed.Fast);
     urlsToCrawl = new Queue();
     resultFileNames = new Queue();
     crawledUrls = new ArrayList();
     queueSize = 0;
     dataFileName = String.Empty;
     defaultEncoding = Encoding.GetEncoding("ISO-8859-7");
     htmlParser = HtmlParser.Instance();
     textParser = TextParser.Instance();
     pdfParser = PdfParser.Instance();
     swfParser = SwfParser.Instance();
     robotsFilter = RobotsFilter.Instance();
     domainFilter = DomainFilter.Instance();
     hostRequestFilter =HostRequestFilter.Instance();
     hostBanFilter = HostBanFilter.Instance();
     proxy = WebServiceProxy.Instance();
 }
Example #5
0
 /// <summary>
 /// Constructs a new istance of the <see cref="Crawler"/> class and initializes its
 /// properties with the default values. The constructor is private so that only the
 /// class itself can create an instance.
 /// </summary>
 private Crawler()
 {
     //first of all get a reference to the global variables because they are needed
     //in order to initialize some variables.
     globals = Globals.Instance();
     mustStop = false;
     stopping = false;
     state = CrawlerState.Stopped;
     stats = new long[10] {0,0,0,0,0,0,0,0,0,0};
     numThreads = (int)globals.Settings.ConnectionSpeed;
     runningThreads = 0;
     //sendResultsThread = null;
     synchronizeThread = null;
     crawlingThreads = null;
     syncBackOff = new Backoff(BackoffSpeed.Declining, 30000);
     downloadBackOff = new Backoff(BackoffSpeed.Fast);
     urlsToCrawl = new Queue();
     resultFileNames = new Queue();
     crawledUrls = new ArrayList();
     queueSize = 0;
     dataFileName = String.Empty;
     defaultEncoding = Encoding.GetEncoding("ISO-8859-7");
     defaultGreekEncoding = Encoding.GetEncoding(1253);
     contentRegex = new Regex("<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(utf-7|utf-8|utf-16|windows-1253)([^>])*>",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
     htmlParser = HtmlParser.Instance();
     textParser = TextParser.Instance();
     pdfParser = PdfParser.Instance();
     swfParser = SwfParser.Instance();
     nullParser = NullParser.Instance();
     robotsFilter = RobotsFilter.Instance();
     robotsFilter.LoadEntries();
     domainFilter = DomainFilter.Instance();
     hostRequestFilter = HostRequestFilter.Instance();
     hostBanFilter = HostBanFilter.Instance();
     //proxy = WebServiceProxy.Instance();
     proxy = CrawlWaveServerProxy.Instance(globals);
 }
Example #6
0
        private Regex spacesRegex; //Regular Expression for compacting white space characters

        #endregion Fields

        #region Constructors

        /// <summary>
        /// The constructor is private so that only the class itself can create an instance.
        /// </summary>
        private TextParser()
        {
            //Initialize the synchronization mechanism
            mutex=new Mutex();
            //Initialize the Regular Expressions
            hrefRegex=new Regex(@"(http|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            //use "(http|ftp|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" to enable ftp urls
            sessionIDRegex = new Regex(@"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled); //@"^([0-9a-f]{32})|(\{?[0-9a-f]{8}-([0-9a-f]{4}-){3}-[0-9a-f]{12}\}?)$"
            spacesRegex = new Regex(@"\s+",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            //Initialize the filters
            robotsFilter = RobotsFilter.Instance();
            domainFilter = DomainFilter.Instance();
            //Get a reference to the global variables and application settings
            globals = Globals.Instance();
        }
Example #7
0
        private Regex stylesRegex; //Regular Expression for stylesheets

        #endregion Fields

        #region Constructors

        /// <summary>
        /// The constructor is private so that only the class itself can create an instance.
        /// </summary>
        private HtmlParser()
        {
            //Initialize the synchronization mechanism
            mutex=new Mutex();
            //Initialize the Regular Expressions
            ahrefRegex = new Regex("href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            baseRegex = new Regex("base\\s*href=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            charsetRegex = new Regex("<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(iso-8859-7|windows-1253)([^>])*>",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            frameRegex = new Regex("frame\\s*.*src\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            flashRegex = new Regex("<embed\\s*([^>])*src\\s*=([^>])*type\\s*=([^>])*application/x-shockwave-flash([^>])*>",  RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            refreshRegex = new Regex("<meta\\s*http-equiv=([^>])*refresh([^>])*content\\s*=\\s*\"[^>]*\">", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            robotRegex = new Regex("<meta\\s*name\\s*=\\s*\"robots\"\\s*content\\s*=\\s*\"[^>]*\">", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            scriptRegex = new Regex(@"(?i)<script([^>])*>(\w|\W)*</script([^>])*>",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled); //@"(?i)<script([^>])*>(\w|\W)*</script([^>])*>" or @"<script[^>]*>(\w|\W)*?</script[^>]*>"
            spacesRegex = new Regex(@"\s+",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            stylesRegex = new Regex(@"<style([^>])*>(\w|\W)*</style([^>])*>",RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            stripTagRegex = new Regex("<[^>]*>", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);//<[^>]+> or   >(?:(?<t>[^<]*))
            sessionIDRegex = new Regex(@"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled); //@"^([0-9a-f]{32})|(\{?[0-9a-f]{8}-([0-9a-f]{4}-){3}-[0-9a-f]{12}\}?)$"
            inlinedSessionIDRegex = new Regex(@"/(%28|\{)?(([0-9a-fA-F]{8}[-]?(([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}))|([0-9a-fA-F]{12,64}))(%29|\})?/", RegexOptions.CultureInvariant|RegexOptions.Multiline|RegexOptions.IgnoreCase|RegexOptions.Compiled);
            //Initialize the filters
            robotsFilter = RobotsFilter.Instance();
            domainFilter = DomainFilter.Instance();
            //Initialize the culture info to Greek (ISO)
            culture = CultureInfo.CreateSpecificCulture("el-GR");
            //Get a reference to the global variables and application settings
            globals = Globals.Instance();
        }