Beispiel #1
0
 /// <summary>
 /// Initializes a new <see cref="Controller"/>: obtains the shared
 /// <see cref="Globals"/> object, creates the event logger and the statistics
 /// array, and obtains the server proxy. No crawler is attached at this point.
 /// </summary>
 public Controller()
 {
     // Number of elements in the statistics array.
     const int statsLength = 10;

     // Shared global variables and application settings.
     globals = Globals.Instance();

     log = new QueueEventLogger(100);

     // Start without a crawler.
     crawler = null;

     stats = new long[statsLength];

     // The proxy is obtained using the settings fetched above.
     proxy = CrawlWaveServerProxy.Instance(globals);
 }
        private Mutex mutex; //Mutex supporting safe access from multiple threads

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Private constructor, so instances can only be created from inside the
        /// class. Creates the mutex and the host entry table, and obtains the
        /// shared <see cref="Globals"/> object.
        /// </summary>
        private HostRequestFilter()
        {
            // Initial capacity of the dictionary holding the host entries.
            const int initialHostCapacity = 128;

            // Synchronization mechanism
            mutex = new Mutex();

            // Storage for the HostRequestFilterEntry objects, keyed by string
            hostTable = new Dictionary<string, HostRequestFilterEntry>(initialHostCapacity);

            // Global variables and application settings
            globals = Globals.Instance();
        }
Beispiel #3
0
 /// <summary>
 /// Private constructor, so instances can only be created from inside the
 /// class. Creates the banned host table, obtains the shared
 /// <see cref="Globals"/> object and the server proxy, and then calls
 /// <c>InitializeBannedHosts</c>.
 /// </summary>
 private HostBanFilter()
 {
     // Storage for the banned host entries
     Hashtable bannedHosts = new Hashtable();
     hostTable = bannedHosts;

     // Global variables and application settings
     globals = Globals.Instance();

     // Server proxy, obtained using the settings fetched above
     proxy = CrawlWaveServerProxy.Instance(globals);

     // Initialize the list of banned hosts; called last, after the fields
     // above have been assigned
     InitializeBannedHosts();
 }
Beispiel #4
0
 /// <summary>
 /// Private constructor, so instances can only be created from inside the
 /// class. Creates the mutex, selects the ISO-8859-7 encoding, creates the
 /// converter and obtains the text parser and the shared
 /// <see cref="Globals"/> object.
 /// </summary>
 private PdfParser()
 {
     // Name of the encoding assigned to the encoding field.
     const string encodingName = "ISO-8859-7";

     // Synchronization mechanism
     mutex = new Mutex();

     // Encoding
     encoding = Encoding.GetEncoding(encodingName);

     // Converter and parser
     converter = new XpdfTextClass();
     parser = TextParser.Instance();

     // Global variables and application settings
     globals = Globals.Instance();
 }
Beispiel #5
0
        private HtmlParser parser; //This will be used to extract links after the conversion

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Private constructor, so instances can only be created from inside the
        /// class. Creates the mutex, selects UTF-8 as the encoding, creates the
        /// SWF-to-HTML converter and obtains the HTML parser and the shared
        /// <see cref="Globals"/> object.
        /// </summary>
        private SwfParser()
        {
            // Synchronization mechanism
            mutex = new Mutex();

            // Encoding: UTF-8
            encoding = Encoding.UTF8;

            // Converter, plus the HTML parser used to extract links after the conversion
            converter = new CSwf2HtmlConverterClass();
            parser = HtmlParser.Instance();

            // Global variables and application settings
            globals = Globals.Instance();
        }
Beispiel #6
0
        private Mutex mutex; //Mutex supporting safe access from multiple threads

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Private constructor, so instances can only be created from inside the
        /// class. Creates the mutex, the IP address table, the list of file names
        /// and the IPv4 address regular expression, obtains the shared
        /// <see cref="Globals"/> object and finally calls <c>LoadIPAddresses</c>.
        /// </summary>
        private DomainFilter()
        {
            // Options applied to the regular expression created below.
            const RegexOptions ipRegexOptions =
                RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // Synchronization mechanism
            mutex = new Mutex();

            // Storage for the IP addresses; keyLength of 16 will create 65536 root nodes
            ipTable = new IPCountryTable(16);

            // File names. Interning them saves us a little memory.
            FileNames = new string[]
            {
                String.Intern("apnic.latest"),
                String.Intern("arin.latest"),
                String.Intern("lacnic.latest"),
                String.Intern("ripencc.latest"),
            };

            // Regular expression for IPv4 addresses.
            // For IPv6 addresses the following pattern can be used:
            // ^(([\dA-Fa-f]{1,4}:){7}[\dA-Fa-f]{1,4})(:([\d]{1,3}.){3}[\d]{1,3})?$
            // and the input length must be between 16 and 39 characters
            ipAddressRegex = new Regex(@"^(?:(?:25[0-5]|2[0-4]\d|[01]\d\d|\d?\d)(?(\.?\d)\.)){4}$", ipRegexOptions);

            // Global variables and application settings
            globals = Globals.Instance();

            // Load the IP address tables into the storage; called last, after
            // the fields above have been assigned
            LoadIPAddresses();
        }
Beispiel #7
0
 /// <summary>
 /// Private constructor, so instances can only be created from inside the
 /// class. Its only work is to obtain the shared <see cref="Globals"/> object.
 /// </summary>
 private Client()
 {
     // Keep a reference to the shared Globals object.
     Globals sharedSettings = Globals.Instance();
     globals = sharedSettings;
 }
Beispiel #8
0
 // Single gate object shared by every caller of Instance(). Mutual exclusion
 // only works when all threads synchronize on the same object; creating a new
 // synchronization object per call (as the previous implementation did with a
 // local Mutex) excludes nobody.
 private static readonly object instanceGate = new object();

 /// <summary>
 /// Provides a global access point for the single instance of the <see cref="Globals"/>
 /// class. The instance is created on the first call; concurrent first calls are
 /// serialized so that only one instance is ever constructed.
 /// </summary>
 /// <returns>A reference to the single instance of <see cref="Globals"/>.</returns>
 public static Globals Instance()
 {
     //Fast path: once the instance exists no locking is needed.
     if (instance == null)
     {
         //The instance's own mutex cannot be used here because it is only
         //initialized in the constructor, so a dedicated static gate is used.
         lock (instanceGate)
         {
             //Re-check inside the lock: another thread may have created the
             //instance while this one was waiting for the gate.
             if (instance == null)
             {
                 instance = new Globals();
             }
         }
     }
     //NOTE(review): the declaration of 'instance' is not visible here; consider
     //marking it volatile so the unlocked read above cannot observe a stale value.
     return instance;
 }
Beispiel #9
0
        private Regex spacesRegex; //Regular Expression for compacting white space characters

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Private constructor, so instances can only be created from inside the
        /// class. Creates the mutex and the regular expressions, and obtains the
        /// robots filter, the domain filter and the shared <see cref="Globals"/>
        /// object.
        /// </summary>
        private TextParser()
        {
            // Options shared by every regular expression created below.
            const RegexOptions sharedOptions =
                RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // Synchronization mechanism
            mutex = new Mutex();

            // http/https URLs.
            // Use "(http|ftp|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" to enable ftp urls
            hrefRegex = new Regex(
                @"(http|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?",
                sharedOptions);

            // Session identifiers: 40-64 hex digits, or a GUID-shaped value at the end of a line
            sessionIDRegex = new Regex(
                @"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$",
                sharedOptions);

            // Runs of white space characters, used for compacting them
            spacesRegex = new Regex(@"\s+", sharedOptions);

            // Filters
            robotsFilter = RobotsFilter.Instance();
            domainFilter = DomainFilter.Instance();

            // Global variables and application settings
            globals = Globals.Instance();
        }
Beispiel #10
0
        private Regex stylesRegex; //Regular Expression for stylesheets

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Private constructor, so instances can only be created from inside the
        /// class. Creates the mutex and the regular expressions, obtains the robots
        /// and domain filters, selects the Greek ("el-GR") culture and obtains the
        /// shared <see cref="Globals"/> object.
        /// </summary>
        private HtmlParser()
        {
            // Options shared by every regular expression created below.
            const RegexOptions sharedOptions =
                RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // Synchronization mechanism
            mutex = new Mutex();

            // href attribute values (quoted or unquoted), captured in group 1
            ahrefRegex = new Regex("href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", sharedOptions);

            // base href values, captured in group 1
            baseRegex = new Regex("base\\s*href=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", sharedOptions);

            // meta http-equiv tags declaring the iso-8859-7 or windows-1253 charset
            charsetRegex = new Regex("<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(iso-8859-7|windows-1253)([^>])*>", sharedOptions);

            // frame src values, captured in group 1
            frameRegex = new Regex("frame\\s*.*src\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))", sharedOptions);

            // embed tags of type application/x-shockwave-flash
            flashRegex = new Regex("<embed\\s*([^>])*src\\s*=([^>])*type\\s*=([^>])*application/x-shockwave-flash([^>])*>", sharedOptions);

            // meta http-equiv refresh tags
            refreshRegex = new Regex("<meta\\s*http-equiv=([^>])*refresh([^>])*content\\s*=\\s*\"[^>]*\">", sharedOptions);

            // meta name="robots" tags
            robotRegex = new Regex("<meta\\s*name\\s*=\\s*\"robots\"\\s*content\\s*=\\s*\"[^>]*\">", sharedOptions);

            // script elements.
            // NOTE(review): (\w|\W)* is greedy and matches any character, so one
            // match runs from the first <script> to the last </script> in the
            // input; the non-greedy form @"<script[^>]*>(\w|\W)*?</script[^>]*>"
            // would stop at the nearest closing tag. Confirm which is intended.
            scriptRegex = new Regex(@"(?i)<script([^>])*>(\w|\W)*</script([^>])*>", sharedOptions);

            // Runs of white space characters
            spacesRegex = new Regex(@"\s+", sharedOptions);

            // style elements (same greedy construction as the script pattern above)
            stylesRegex = new Regex(@"<style([^>])*>(\w|\W)*</style([^>])*>", sharedOptions);

            // Any tag; alternatives considered: <[^>]+> or >(?:(?<t>[^<]*))
            stripTagRegex = new Regex("<[^>]*>", sharedOptions);

            // Session identifiers: 40-64 hex digits, or a GUID-shaped value at the end of a line
            sessionIDRegex = new Regex(
                @"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$",
                sharedOptions);

            // Session identifiers appearing as a path segment between two slashes
            inlinedSessionIDRegex = new Regex(
                @"/(%28|\{)?(([0-9a-fA-F]{8}[-]?(([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}))|([0-9a-fA-F]{12,64}))(%29|\})?/",
                sharedOptions);

            // Filters
            robotsFilter = RobotsFilter.Instance();
            domainFilter = DomainFilter.Instance();

            // Culture info: Greek (ISO)
            culture = CultureInfo.CreateSpecificCulture("el-GR");

            // Global variables and application settings
            globals = Globals.Instance();
        }