Exemple #1
0
        private HtmlParser parser; //This will be used to extract links after the conversion

        #endregion Fields

        #region Constructors

        /// <summary>
        /// Initializes a <see cref="SwfParser"/>: creates its mutex, selects UTF-8 as its
        /// encoding, creates the converter and obtains references to the shared
        /// <see cref="HtmlParser"/> and <see cref="Globals"/> instances.
        /// The constructor is private so that only the class itself can create an instance.
        /// </summary>
        private SwfParser()
        {
            //Initialize the synchronization mechanism
            mutex=new Mutex();
            //Initialize the Encoding. UTF-8 is used; the trailing commented-out call shows
            //that ISO-8859-7 was an earlier choice which is currently disabled.
            encoding = Encoding.UTF8;//GetEncoding("ISO-8859-7");
            //Initialize the converters and parsers. The HtmlParser is obtained through its
            //Instance() accessor rather than constructed here.
            converter = new CSwf2HtmlConverterClass();
            parser = HtmlParser.Instance();
            //Get a reference to the global variables and application settings
            globals = Globals.Instance();
        }
Exemple #2
0
 /// <summary>
 /// Creates a new <see cref="Crawler"/> and puts every one of its members into its
 /// default state. Only a single Crawler instance is meant to exist.
 /// </summary>
 public Crawler()
 {
     //The global variables have to be fetched before anything else, because the
     //number of threads assigned further down is read from their settings.
     globals = Globals.Instance();

     //Run state: the crawler starts out stopped, with no stop request pending.
     mustStop = false;
     stopping = false;
     state = CrawlerState.Stopped;

     //Ten statistics counters; a freshly allocated long array is already all zeros.
     stats = new long[10];

     //Threading: the thread count is the numeric value of the configured connection
     //speed, and no thread exists yet.
     numThreads = (int)globals.Settings.ConnectionSpeed;
     runningThreads = 0;
     sendResultsThread = null;
     synchronizeThread = null;
     crawlingThreads = null;

     //Backoff helpers, one for synchronization and one for downloading.
     syncBackOff = new ExponentialBackoff(BackoffSpeed.Declining);
     downloadBackOff = new ExponentialBackoff(BackoffSpeed.Fast);

     //Work queues and bookkeeping collections all start out empty.
     urlsToCrawl = new Queue();
     resultFileNames = new Queue();
     crawledUrls = new ArrayList();
     queueSize = 0;
     dataFileName = string.Empty;

     //Default text encoding.
     defaultEncoding = Encoding.GetEncoding("ISO-8859-7");

     //Shared parser instances.
     htmlParser = HtmlParser.Instance();
     textParser = TextParser.Instance();
     pdfParser = PdfParser.Instance();
     swfParser = SwfParser.Instance();

     //Shared filter instances.
     robotsFilter = RobotsFilter.Instance();
     domainFilter = DomainFilter.Instance();
     hostRequestFilter = HostRequestFilter.Instance();
     hostBanFilter = HostBanFilter.Instance();

     //Shared web service proxy instance.
     proxy = WebServiceProxy.Instance();
 }
Exemple #3
0
 /// <summary>
 /// Creates a new <see cref="Crawler"/> and puts every one of its members into its
 /// default state. The constructor is private, so only the class itself is able to
 /// create an instance.
 /// </summary>
 private Crawler()
 {
     //The global variables have to be fetched before anything else, because the
     //number of threads and the server proxy set up below both depend on them.
     globals = Globals.Instance();

     //Run state: the crawler starts out stopped, with no stop request pending.
     mustStop = false;
     stopping = false;
     state = CrawlerState.Stopped;

     //Ten statistics counters; a freshly allocated long array is already all zeros.
     stats = new long[10];

     //Threading: the thread count is the numeric value of the configured connection
     //speed, and no thread exists yet. Note that sendResultsThread is deliberately
     //not assigned here (that assignment is disabled in this version of the class).
     numThreads = (int)globals.Settings.ConnectionSpeed;
     runningThreads = 0;
     synchronizeThread = null;
     crawlingThreads = null;

     //Backoff helpers, one for synchronization and one for downloading.
     syncBackOff = new Backoff(BackoffSpeed.Declining, 30000);
     downloadBackOff = new Backoff(BackoffSpeed.Fast);

     //Work queues and bookkeeping collections all start out empty.
     urlsToCrawl = new Queue();
     resultFileNames = new Queue();
     crawledUrls = new ArrayList();
     queueSize = 0;
     dataFileName = string.Empty;

     //Text encodings, plus a compiled, case-insensitive pattern that matches a
     //<meta http-equiv ...> tag whose charset is one of utf-7, utf-8, utf-16 or
     //windows-1253.
     defaultEncoding = Encoding.GetEncoding("ISO-8859-7");
     defaultGreekEncoding = Encoding.GetEncoding(1253);
     contentRegex = new Regex(
         "<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(utf-7|utf-8|utf-16|windows-1253)([^>])*>",
         RegexOptions.CultureInvariant
             | RegexOptions.Multiline
             | RegexOptions.IgnoreCase
             | RegexOptions.Compiled);

     //Shared parser instances.
     htmlParser = HtmlParser.Instance();
     textParser = TextParser.Instance();
     pdfParser = PdfParser.Instance();
     swfParser = SwfParser.Instance();
     nullParser = NullParser.Instance();

     //Shared filter instances; the robots filter has its entries loaded right away.
     robotsFilter = RobotsFilter.Instance();
     robotsFilter.LoadEntries();
     domainFilter = DomainFilter.Instance();
     hostRequestFilter = HostRequestFilter.Instance();
     hostBanFilter = HostBanFilter.Instance();

     //The server proxy is a CrawlWaveServerProxy built from the globals (the
     //WebServiceProxy alternative is disabled in this version of the class).
     proxy = CrawlWaveServerProxy.Instance(globals);
 }
Exemple #4
0
 /// <summary>
 /// Guards the lazy creation of the single <see cref="HtmlParser"/> instance. It is
 /// static so that every caller of <see cref="Instance"/> contends on the same object.
 /// </summary>
 private static readonly object instanceCreationLock = new object();

 /// <summary>
 /// Provides a global access point for the single instance of the <see cref="HtmlParser"/>
 /// class, creating that instance on the first call.
 /// </summary>
 /// <returns>A reference to the single instance of <see cref="HtmlParser"/>.</returns>
 public static HtmlParser Instance()
 {
     //Fast path: once the instance exists it is returned without taking the lock.
     if (instance == null)
     {
         //Creation must be serialized on an object shared by all threads. A Mutex
         //created locally on each call protects nothing, because every thread would
         //wait on its own private mutex and acquire it immediately. The instance's
         //own mutex cannot be used either, since it is only created by the constructor.
         lock (instanceCreationLock)
         {
             //Check again inside the lock: another thread may have created the
             //instance while this one was waiting.
             if (instance == null)
             {
                 instance = new HtmlParser();
             }
         }
     }
     return instance;
 }