//The Content Type supported by the parser
#endregion

#region Constructor and Singleton Instance members

/// <summary>
/// Builds the parser's synchronization object, its compiled regular expressions,
/// and its references to the shared filters and global settings.
/// Declared private so that instances can only be created from inside the class.
/// </summary>
private TextParser()
{
    // All three expressions are built with the same option set.
    RegexOptions sharedOptions = RegexOptions.CultureInvariant
                               | RegexOptions.Multiline
                               | RegexOptions.IgnoreCase
                               | RegexOptions.Compiled;

    // Synchronization mechanism
    mutex = new Mutex();

    // Absolute http/https urls.
    // Use "(http|ftp|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" to enable ftp urls.
    hrefRegex = new Regex(
        @"(http|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?",
        sharedOptions);

    // Long hexadecimal runs (40-64 digits) or GUID-shaped tokens, optionally wrapped in {} or ().
    // Earlier variant: @"^([0-9a-f]{32})|(\{?[0-9a-f]{8}-([0-9a-f]{4}-){3}-[0-9a-f]{12}\}?)$"
    sessionIDRegex = new Regex(
        @"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$",
        sharedOptions);

    // One or more consecutive white space characters.
    spacesRegex = new Regex(@"\s+", sharedOptions);

    // Shared filter instances
    robotsFilter = RobotsFilter.Instance();
    domainFilter = DomainFilter.Instance();

    // Global variables and application settings
    globals = Globals.Instance();
}
/// <summary>
/// Lock object shared by every caller of <see cref="Instance"/>; it guards the
/// one-time creation of the single <see cref="RobotsFilter"/> instance.
/// </summary>
private static readonly object instanceLock = new object();

/// <summary>
/// Provides a global access point for the single instance of the <see cref="RobotsFilter"/>
/// class.
/// </summary>
/// <returns>A reference to the single instance of <see cref="RobotsFilter"/>.</returns>
public static RobotsFilter Instance()
{
    if (instance == null)
    {
        // The previous implementation created a brand new Mutex on every call, so
        // each thread waited on its own private mutex and nothing was actually
        // excluded (and the mutex was closed without being released). All callers
        // must contend on the same object for the second check to be meaningful.
        // The instance-level mutex still cannot be used here because it is only
        // initialized in the constructor.
        lock (instanceLock)
        {
            if (instance == null)
            {
                instance = new RobotsFilter();
            }
        }
    }
    return instance;
}
/// <summary>
/// Synchronization root shared by every caller of <see cref="Instance"/>; it guards
/// the one-time creation of the single <see cref="RobotsFilter"/> instance.
/// </summary>
private static readonly object instanceSyncRoot = new object();

/// <summary>
/// Provides a global access point for the single instance of the <see cref="RobotsFilter"/>
/// class.
/// </summary>
/// <returns>A reference to the single instance of <see cref="RobotsFilter"/>.</returns>
public static RobotsFilter Instance()
{
    if (instance == null)
    {
        // A Mutex created locally on each call is private to that call, so it never
        // blocks another thread; it was also closed while still owned. Locking on a
        // single shared static object gives real mutual exclusion around creation.
        // The instance-level mutex cannot be used here since it is only initialized
        // in the constructor.
        lock (instanceSyncRoot)
        {
            if (instance == null)
            {
                instance = new RobotsFilter();
            }
        }
    }
    return instance;
}
/// <summary>
/// Creates a new <see cref="Crawler"/> and sets every field to its default value:
/// stopped state, zeroed statistics, empty queues and references to the shared
/// parsers, filters and proxy. There should be only one instance of Crawler.
/// </summary>
public Crawler()
{
    // The globals are fetched first because the thread count below is read from them.
    globals = Globals.Instance();

    // Run state
    mustStop = false;
    stopping = false;
    state = CrawlerState.Stopped;

    // Ten statistics counters, all starting at zero.
    stats = new long[10];

    // Threading
    numThreads = (int)globals.Settings.ConnectionSpeed;
    runningThreads = 0;
    sendResultsThread = null;
    synchronizeThread = null;
    crawlingThreads = null;

    // Backoff helpers
    syncBackOff = new ExponentialBackoff(BackoffSpeed.Declining);
    downloadBackOff = new ExponentialBackoff(BackoffSpeed.Fast);

    // Work queues and bookkeeping
    urlsToCrawl = new Queue();
    resultFileNames = new Queue();
    crawledUrls = new ArrayList();
    queueSize = 0;
    dataFileName = string.Empty;

    // Default text encoding
    defaultEncoding = Encoding.GetEncoding("ISO-8859-7");

    // Content parsers
    htmlParser = HtmlParser.Instance();
    textParser = TextParser.Instance();
    pdfParser = PdfParser.Instance();
    swfParser = SwfParser.Instance();

    // Url filters
    robotsFilter = RobotsFilter.Instance();
    domainFilter = DomainFilter.Instance();
    hostRequestFilter = HostRequestFilter.Instance();
    hostBanFilter = HostBanFilter.Instance();

    // Server proxy
    proxy = WebServiceProxy.Instance();
}
/// <summary>
/// Creates a new <see cref="Crawler"/> and sets every field to its default value:
/// stopped state, zeroed statistics, empty queues, default encodings and references
/// to the shared parsers, filters and server proxy. Declared private so that
/// instances can only be created from inside the class.
/// </summary>
private Crawler()
{
    // The globals are fetched first because the thread count and the proxy below need them.
    globals = Globals.Instance();

    // Run state
    mustStop = false;
    stopping = false;
    state = CrawlerState.Stopped;

    // Ten statistics counters, all starting at zero.
    stats = new long[10];

    // Threading (sendResultsThread is intentionally not assigned here)
    numThreads = (int)globals.Settings.ConnectionSpeed;
    runningThreads = 0;
    synchronizeThread = null;
    crawlingThreads = null;

    // Backoff helpers
    syncBackOff = new Backoff(BackoffSpeed.Declining, 30000);
    downloadBackOff = new Backoff(BackoffSpeed.Fast);

    // Work queues and bookkeeping
    urlsToCrawl = new Queue();
    resultFileNames = new Queue();
    crawledUrls = new ArrayList();
    queueSize = 0;
    dataFileName = string.Empty;

    // Default encodings: ISO-8859-7 by name and code page 1253 by number.
    defaultEncoding = Encoding.GetEncoding("ISO-8859-7");
    defaultGreekEncoding = Encoding.GetEncoding(1253);

    // Matches a <meta http-equiv ... charset=...> tag naming one of the listed charsets.
    RegexOptions contentOptions = RegexOptions.CultureInvariant
                                | RegexOptions.Multiline
                                | RegexOptions.IgnoreCase
                                | RegexOptions.Compiled;
    contentRegex = new Regex(
        "<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(utf-7|utf-8|utf-16|windows-1253)([^>])*>",
        contentOptions);

    // Content parsers
    htmlParser = HtmlParser.Instance();
    textParser = TextParser.Instance();
    pdfParser = PdfParser.Instance();
    swfParser = SwfParser.Instance();
    nullParser = NullParser.Instance();

    // Url filters; the robots filter loads its entries as soon as it is obtained.
    robotsFilter = RobotsFilter.Instance();
    robotsFilter.LoadEntries();
    domainFilter = DomainFilter.Instance();
    hostRequestFilter = HostRequestFilter.Instance();
    hostBanFilter = HostBanFilter.Instance();

    // Server proxy (previously WebServiceProxy.Instance())
    proxy = CrawlWaveServerProxy.Instance(globals);
}
private Regex spacesRegex; //Regular Expression for compacting white space characters

#endregion Fields

#region Constructors

/// <summary>
/// Sets up the synchronization object, the compiled regular expressions, the shared
/// filters and the global settings reference used by the parser.
/// The constructor is private so that only the class itself can create an instance.
/// </summary>
private TextParser()
{
    // Option set common to every expression created below.
    const RegexOptions CommonOptions = RegexOptions.CultureInvariant
                                     | RegexOptions.Multiline
                                     | RegexOptions.IgnoreCase
                                     | RegexOptions.Compiled;

    // Synchronization mechanism
    mutex = new Mutex();

    // Absolute http/https urls.
    // Use "(http|ftp|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?" to enable ftp urls.
    hrefRegex = new Regex(
        @"(http|https)://[\w]+(\.[\w]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?",
        CommonOptions);

    // Long hexadecimal runs (40-64 digits) or GUID-shaped tokens, optionally wrapped in {} or ().
    // Earlier variant: @"^([0-9a-f]{32})|(\{?[0-9a-f]{8}-([0-9a-f]{4}-){3}-[0-9a-f]{12}\}?)$"
    sessionIDRegex = new Regex(
        @"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$",
        CommonOptions);

    // One or more consecutive white space characters.
    spacesRegex = new Regex(@"\s+", CommonOptions);

    // Shared filter instances
    robotsFilter = RobotsFilter.Instance();
    domainFilter = DomainFilter.Instance();

    // Global variables and application settings
    globals = Globals.Instance();
}
private Regex stylesRegex; //Regular Expression for stylesheets

#endregion Fields

#region Constructors

/// <summary>
/// Sets up the synchronization object, all compiled regular expressions used to pick
/// apart HTML, the shared filters, the Greek culture and the global settings reference.
/// The constructor is private so that only the class itself can create an instance.
/// </summary>
private HtmlParser()
{
    // Option set common to every expression created below.
    const RegexOptions CommonOptions = RegexOptions.CultureInvariant
                                     | RegexOptions.Multiline
                                     | RegexOptions.IgnoreCase
                                     | RegexOptions.Compiled;

    // Synchronization mechanism
    mutex = new Mutex();

    // href attribute values, quoted or unquoted (captured in group 1).
    ahrefRegex = new Regex(
        "href\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))",
        CommonOptions);

    // <base href=...> value (captured in group 1).
    baseRegex = new Regex(
        "base\\s*href=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))",
        CommonOptions);

    // <meta http-equiv ... charset=...> naming iso-8859-7 or windows-1253.
    charsetRegex = new Regex(
        "<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(iso-8859-7|windows-1253)([^>])*>",
        CommonOptions);

    // frame src attribute values (captured in group 1).
    frameRegex = new Regex(
        "frame\\s*.*src\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))",
        CommonOptions);

    // <embed> tags whose type is application/x-shockwave-flash.
    flashRegex = new Regex(
        "<embed\\s*([^>])*src\\s*=([^>])*type\\s*=([^>])*application/x-shockwave-flash([^>])*>",
        CommonOptions);

    // <meta http-equiv ... refresh ... content="..."> tags.
    refreshRegex = new Regex(
        "<meta\\s*http-equiv=([^>])*refresh([^>])*content\\s*=\\s*\"[^>]*\">",
        CommonOptions);

    // <meta name="robots" content="..."> tags.
    robotRegex = new Regex(
        "<meta\\s*name\\s*=\\s*\"robots\"\\s*content\\s*=\\s*\"[^>]*\">",
        CommonOptions);

    // <script>...</script> blocks.
    // Alternatives: @"(?i)<script([^>])*>(\w|\W)*</script([^>])*>" or @"<script[^>]*>(\w|\W)*?</script[^>]*>"
    // NOTE(review): (\w|\W)* is greedy, so when the input holds several script blocks one
    // match runs from the first opening tag to the last closing tag - confirm this is intended.
    scriptRegex = new Regex(
        @"(?i)<script([^>])*>(\w|\W)*</script([^>])*>",
        CommonOptions);

    // One or more consecutive white space characters.
    spacesRegex = new Regex(@"\s+", CommonOptions);

    // <style>...</style> blocks.
    // NOTE(review): same greedy (\w|\W)* construct as the script expression - confirm.
    stylesRegex = new Regex(
        @"<style([^>])*>(\w|\W)*</style([^>])*>",
        CommonOptions);

    // Any tag; alternatives: <[^>]+> or >(?:(?<t>[^<]*))
    stripTagRegex = new Regex("<[^>]*>", CommonOptions);

    // Long hexadecimal runs (40-64 digits) or GUID-shaped tokens, optionally wrapped in {} or ().
    // Earlier variant: @"^([0-9a-f]{32})|(\{?[0-9a-f]{8}-([0-9a-f]{4}-){3}-[0-9a-f]{12}\}?)$"
    sessionIDRegex = new Regex(
        @"([0-9a-fA-F]{40,64})|([\{|\(]?[0-9a-fA-F]{8}[-]?([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}[\)|\}]?)$",
        CommonOptions);

    // The same kind of token appearing as a whole path segment between two slashes.
    inlinedSessionIDRegex = new Regex(
        @"/(%28|\{)?(([0-9a-fA-F]{8}[-]?(([0-9a-fA-F]{4}[-]?){3}[0-9a-fA-F]{12}))|([0-9a-fA-F]{12,64}))(%29|\})?/",
        CommonOptions);

    // Shared filter instances
    robotsFilter = RobotsFilter.Instance();
    domainFilter = DomainFilter.Instance();

    // Culture info for Greek ("el-GR")
    culture = CultureInfo.CreateSpecificCulture("el-GR");

    // Global variables and application settings
    globals = Globals.Instance();
}