Exemplo n.º 1
0
 /// <summary>
 /// Sets up a new <see cref="PdfParser"/>. The constructor is declared private, so
 /// instances can only be created from inside the class itself.
 /// </summary>
 private PdfParser()
 {
     // Synchronization primitive owned by this parser.
     mutex = new Mutex();

     // Text encoding used by this parser: ISO-8859-7.
     encoding = Encoding.GetEncoding("ISO-8859-7");

     // Collaborators: a fresh converter object and the shared TextParser instance.
     converter = new XpdfTextClass();
     parser = TextParser.Instance();

     // Shared access point for global variables and application settings.
     globals = Globals.Instance();
 }
Exemplo n.º 2
0
 /// <summary>
 /// Creates a new instance of the <see cref="Crawler"/> class with every field set to
 /// its default value. Only one instance of Crawler is expected to exist.
 /// </summary>
 public Crawler()
 {
     // The globals must be fetched first: numThreads below is read from their settings.
     globals = Globals.Instance();

     // Lifecycle flags - the crawler starts out in the Stopped state.
     state = CrawlerState.Stopped;
     stopping = false;
     mustStop = false;

     // Ten counters, all zero (a newly allocated long[] is zero-initialized).
     stats = new long[10];

     // Thread bookkeeping: the thread count is taken from the configured connection
     // speed, and no worker threads exist yet.
     numThreads = (int)globals.Settings.ConnectionSpeed;
     runningThreads = 0;
     crawlingThreads = null;
     synchronizeThread = null;
     sendResultsThread = null;

     // Backoff helpers for synchronization and for downloading.
     syncBackOff = new ExponentialBackoff(BackoffSpeed.Declining);
     downloadBackOff = new ExponentialBackoff(BackoffSpeed.Fast);

     // Work queues and the list of crawled urls all start out empty.
     urlsToCrawl = new Queue();
     resultFileNames = new Queue();
     crawledUrls = new ArrayList();
     queueSize = 0;
     dataFileName = string.Empty;

     // Default text encoding: ISO-8859-7.
     defaultEncoding = Encoding.GetEncoding("ISO-8859-7");

     // Shared parser instances.
     htmlParser = HtmlParser.Instance();
     textParser = TextParser.Instance();
     pdfParser = PdfParser.Instance();
     swfParser = SwfParser.Instance();

     // Shared filter instances.
     robotsFilter = RobotsFilter.Instance();
     domainFilter = DomainFilter.Instance();
     hostRequestFilter = HostRequestFilter.Instance();
     hostBanFilter = HostBanFilter.Instance();

     // Shared web service proxy instance.
     proxy = WebServiceProxy.Instance();
 }
Exemplo n.º 3
0
 /// <summary>
 /// Creates a new instance of the <see cref="Crawler"/> class with every field set to
 /// its default value. The constructor is declared private, so instances can only be
 /// created from inside the class itself.
 /// </summary>
 private Crawler()
 {
     // The globals must be fetched first: numThreads and the proxy below depend on them.
     globals = Globals.Instance();

     // Lifecycle flags - the crawler starts out in the Stopped state.
     state = CrawlerState.Stopped;
     stopping = false;
     mustStop = false;

     // Ten counters, all zero (a newly allocated long[] is zero-initialized).
     stats = new long[10];

     // Thread bookkeeping: the thread count is taken from the configured connection
     // speed, and no worker threads exist yet.
     numThreads = (int)globals.Settings.ConnectionSpeed;
     runningThreads = 0;
     crawlingThreads = null;
     synchronizeThread = null;

     // Backoff helpers for synchronization and for downloading.
     syncBackOff = new Backoff(BackoffSpeed.Declining, 30000);
     downloadBackOff = new Backoff(BackoffSpeed.Fast);

     // Work queues and the list of crawled urls all start out empty.
     urlsToCrawl = new Queue();
     resultFileNames = new Queue();
     crawledUrls = new ArrayList();
     queueSize = 0;
     dataFileName = string.Empty;

     // Encodings: ISO-8859-7 by name, and code page 1253 for the Greek default.
     defaultEncoding = Encoding.GetEncoding("ISO-8859-7");
     defaultGreekEncoding = Encoding.GetEncoding(1253);

     // Case-insensitive, compiled pattern for a <meta http-equiv ...> tag whose charset
     // names one of utf-7, utf-8, utf-16 or windows-1253.
     contentRegex = new Regex(
         "<meta\\s*http-equiv=([^>])*charset\\s*=\\s*([^>])*(utf-7|utf-8|utf-16|windows-1253)([^>])*>",
         RegexOptions.CultureInvariant | RegexOptions.Multiline | RegexOptions.IgnoreCase | RegexOptions.Compiled);

     // Shared parser instances.
     htmlParser = HtmlParser.Instance();
     textParser = TextParser.Instance();
     pdfParser = PdfParser.Instance();
     swfParser = SwfParser.Instance();
     nullParser = NullParser.Instance();

     // Shared filter instances; the robots filter has its entries loaded right away.
     robotsFilter = RobotsFilter.Instance();
     robotsFilter.LoadEntries();
     domainFilter = DomainFilter.Instance();
     hostRequestFilter = HostRequestFilter.Instance();
     hostBanFilter = HostBanFilter.Instance();

     // Server proxy, obtained by passing in the globals.
     proxy = CrawlWaveServerProxy.Instance(globals);
 }
Exemplo n.º 4
0
 /// <summary>
 /// Lock object shared by all callers of <see cref="Instance"/>. It is static and
 /// initialized with the type, so it is available before any instance has been
 /// constructed.
 /// </summary>
 private static readonly object instanceCreationLock = new object();

 /// <summary>
 /// Provides a global access point for the single instance of the <see cref="TextParser"/>
 /// class, creating it on first use.
 /// </summary>
 /// <remarks>
 /// The previous implementation created a new local Mutex on every call. Because each
 /// caller waited on its own private mutex, no caller ever blocked another and two
 /// threads could both construct a TextParser. That mutex was also closed without
 /// being released. All callers now synchronize on one shared static lock object.
 /// </remarks>
 /// <returns>A reference to the single instance of <see cref="TextParser"/></returns>
 public static TextParser Instance()
 {
     // Unsynchronized first check: skip the lock once the instance exists.
     // NOTE(review): the instance field is declared outside this block; consider
     // marking it volatile so this unsynchronized read is safe on every memory model.
     if (instance == null)
     {
         lock (instanceCreationLock)
         {
             // Second check under the lock: another thread may have created the
             // instance while this one was waiting.
             if (instance == null)
             {
                 instance = new TextParser();
             }
         }
     }
     return instance;
 }