private HtmlParser parser; // Used to extract links once the conversion has been done

#endregion Fields

#region Constructors

/// <summary>
/// Creates the <see cref="SwfParser"/> instance. The constructor is private, so an
/// instance can only be created from inside the class itself.
/// </summary>
private SwfParser()
{
    // Synchronization mechanism.
    mutex = new Mutex();

    // Text encoding: UTF-8 (an alternative, GetEncoding("ISO-8859-7"), is left disabled).
    encoding = Encoding.UTF8;

    // Converter and parser.
    converter = new CSwf2HtmlConverterClass();
    parser = HtmlParser.Instance();

    // Reference to the global variables and application settings.
    globals = Globals.Instance();
}
/// <summary>
/// Constructs a new instance of the <see cref="Crawler"/> class and initializes its
/// members with their default values. Only one instance of Crawler is expected to
/// exist.
/// </summary>
public Crawler()
{
    // The global variables come first: later initializations read from them.
    globals = Globals.Instance();

    // Control flags and state.
    mustStop = false;
    stopping = false;
    state = CrawlerState.Stopped;

    // Ten counters, all starting at zero (array elements default to 0).
    stats = new long[10];

    // Thread bookkeeping: the thread count is taken from the configured connection
    // speed, and no thread objects are created here.
    numThreads = (int)globals.Settings.ConnectionSpeed;
    runningThreads = 0;
    sendResultsThread = null;
    synchronizeThread = null;
    crawlingThreads = null;

    // Backoff helpers.
    syncBackOff = new ExponentialBackoff(BackoffSpeed.Declining);
    downloadBackOff = new ExponentialBackoff(BackoffSpeed.Fast);

    // Work queues and the list of crawled urls, all initially empty.
    urlsToCrawl = new Queue();
    resultFileNames = new Queue();
    crawledUrls = new ArrayList();
    queueSize = 0;
    dataFileName = string.Empty;

    // Default text encoding.
    defaultEncoding = Encoding.GetEncoding("ISO-8859-7");

    // Content parsers.
    htmlParser = HtmlParser.Instance();
    textParser = TextParser.Instance();
    pdfParser = PdfParser.Instance();
    swfParser = SwfParser.Instance();

    // Filters.
    robotsFilter = RobotsFilter.Instance();
    domainFilter = DomainFilter.Instance();
    hostRequestFilter = HostRequestFilter.Instance();
    hostBanFilter = HostBanFilter.Instance();

    // Web service proxy.
    proxy = WebServiceProxy.Instance();
}
/// <summary>
/// Constructs a new instance of the <see cref="Crawler"/> class and initializes its
/// members with their default values. The constructor is private, so an instance can
/// only be created from inside the class itself.
/// </summary>
private Crawler()
{
    // The global variables come first: later initializations read from them.
    globals = Globals.Instance();

    // Control flags and state.
    mustStop = false;
    stopping = false;
    state = CrawlerState.Stopped;

    // Ten counters, all starting at zero (array elements default to 0).
    stats = new long[10];

    // Thread bookkeeping: the thread count is taken from the configured connection
    // speed, and no thread objects are created here.
    numThreads = (int)globals.Settings.ConnectionSpeed;
    runningThreads = 0;
    // (disabled) sendResultsThread = null;
    synchronizeThread = null;
    crawlingThreads = null;

    // Backoff helpers.
    syncBackOff = new Backoff(BackoffSpeed.Declining, 30000);
    downloadBackOff = new Backoff(BackoffSpeed.Fast);

    // Work queues and the list of crawled urls, all initially empty.
    urlsToCrawl = new Queue();
    resultFileNames = new Queue();
    crawledUrls = new ArrayList();
    queueSize = 0;
    dataFileName = string.Empty;

    // Encodings: ISO-8859-7 as the default, code page 1253 as the default Greek one.
    defaultEncoding = Encoding.GetEncoding("ISO-8859-7");
    defaultGreekEncoding = Encoding.GetEncoding(1253);

    // Matches a <meta http-equiv=... charset=...> tag that names one of utf-7, utf-8,
    // utf-16 or windows-1253, ignoring case.
    contentRegex = new Regex(
        "<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(utf-7|utf-8|utf-16|windows-1253)([^>])*>",
        RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Content parsers.
    htmlParser = HtmlParser.Instance();
    textParser = TextParser.Instance();
    pdfParser = PdfParser.Instance();
    swfParser = SwfParser.Instance();
    nullParser = NullParser.Instance();

    // Filters; the robots filter loads its entries as soon as it is obtained.
    robotsFilter = RobotsFilter.Instance();
    robotsFilter.LoadEntries();
    domainFilter = DomainFilter.Instance();
    hostRequestFilter = HostRequestFilter.Instance();
    hostBanFilter = HostBanFilter.Instance();

    // Server proxy (replaces the disabled WebServiceProxy.Instance() call).
    proxy = CrawlWaveServerProxy.Instance(globals);
}
/// <summary>
/// Lock object shared by all callers of <see cref="Instance"/>. It is static because
/// it must exist before any <see cref="HtmlParser"/> object has been constructed.
/// </summary>
private static readonly object instanceLock = new object();

/// <summary>
/// Provides a global access point for the single instance of the <see cref="HtmlParser"/>
/// class, creating it on first use.
/// </summary>
/// <returns>A reference to the single instance of <see cref="HtmlParser"/>.</returns>
public static HtmlParser Instance()
{
    // Fast path: once the instance exists no locking is needed.
    // NOTE(review): the 'instance' field is declared outside this block; consider
    // marking it volatile so this unlocked read is safe on every memory model.
    if (instance == null)
    {
        // All threads must contend on the SAME lock object. A lock created locally
        // per call would give no mutual exclusion, because every caller would
        // acquire its own private lock.
        lock (instanceLock)
        {
            // Re-check under the lock: another thread may have created the
            // instance while this one was waiting.
            if (instance == null)
            {
                instance = new HtmlParser();
            }
        }
    }
    return instance;
}