Beispiel #1
0
        /// <summary>
        /// Takes a single Uri (Url) and returns the catalog that is generated
        /// by following all the links from that point.
        /// </summary>
        /// <remarks>
        /// This is the MAIN method of the indexing system. It replaces and clears the
        /// current catalog, calls <c>SetPreferences()</c>, creates the <c>RobotsTxt</c>
        /// helper for the start page, hands the start page to <c>ProcessUri</c>, saves
        /// the catalog and reports progress through <c>ProgressEvent</c> along the way.
        /// </remarks>
        /// <param name="startPageUri">Page at which spidering starts; must not be null.</param>
        /// <returns>The catalog populated (and saved) by this run.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="startPageUri"/> is null.</exception>
        public Catalog BuildCatalog(Uri startPageUri)
        {
            // Fail before any state is touched, so a bad argument cannot leave an
            // empty catalog behind in place of the previous one.
            if (startPageUri == null)
            {
                throw new ArgumentNullException("startPageUri");
            }

            _Catalog = new Catalog();
            // NOTE(review): clear() right after construction looks redundant unless
            // Catalog keeps state outside the instance -- confirm before removing.
            _Catalog.clear();

            // Setup Stop, Go, Stemming
            SetPreferences();

            // NOTE(review): unlike the Uri[] overload, this overload does not assign
            // _CurrentStartUri / _CurrentStartUriString -- confirm ProcessUri does not
            // depend on them being set for this start page.
            _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

            // Spidering starts here, with the start page as the 'root' document.
            ProcessUri(startPageUri, 0);

            // Spidering has finished
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
            ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));

            // Serialization of the Catalog, so we can load it again if the server Application is restarted
            _Catalog.Save();

            ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));

            // Finished; hand the catalog back to the calling code to use.
            return _Catalog;
        }
        /// <summary>
        /// [v6] Spiders every start page in the array, in order, into one new catalog,
        /// then saves that catalog and returns it.
        /// </summary>
        /// <remarks>
        /// For each start page the method records it as the current start Uri, creates
        /// a fresh <c>RobotsTxt</c> helper for it and hands it to <c>ProcessUri</c>.
        /// Progress is reported through <c>ProgressEvent</c> before and after each
        /// start page and around the final save.
        /// </remarks>
        /// <param name="startPageUris">array of start pages; must not be null</param>
        /// <returns>Catalog of words/documents</returns>
        /// <exception cref="ArgumentNullException"><paramref name="startPageUris"/> is null.</exception>
        public Catalog BuildCatalog(Uri[] startPageUris)
        {
            // Fail before any state is touched, so a bad argument cannot leave an
            // empty catalog behind in place of the previous one.
            if (startPageUris == null)
            {
                throw new ArgumentNullException("startPageUris");
            }

            _Catalog = new Catalog();
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (Uri Array) count: " + startPageUris.Length.ToString()));

            // Setup Stop, Go, Stemming
            SetPreferences();

            foreach (Uri startPageUri in startPageUris)
            {
                // Remembered so fully qualified links can be compared against the start page.
                _CurrentStartUri = startPageUri;
                // AbsoluteUri is already a string, so no extra ToString() is needed.
                // NOTE(review): ToLower() is culture-sensitive; kept as-is because the
                // code that compares against this value is not visible here.
                _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToLower();

                ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (start Uri) " + startPageUri.AbsoluteUri));

                _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

                // Spidering of this start page begins here, with it as the 'root' document.
                ProcessUri(startPageUri, 0);

                ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (end Uri) " + startPageUri.AbsoluteUri));
            }

            // Spidering has finished for every start page
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
            ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));

            // Serialization of the Catalog, so we can load it again if the server Application is restarted
            _Catalog.Save();

            ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));

            // Finished; hand the catalog back to the calling code to use.
            return _Catalog;
        }
Beispiel #3
0
        /// <summary>
        /// [v6] Builds one new catalog by spidering each of the given start pages in
        /// turn, saves it and returns it.
        /// </summary>
        /// <remarks>
        /// Each start page is stored as the current start Uri, gets its own
        /// <c>RobotsTxt</c> helper and is then passed to <c>ProcessUri</c>.
        /// <c>ProgressEvent</c> is raised around every start page and around the save.
        /// </remarks>
        /// <param name="startPageUris">array of start pages; must not be null</param>
        /// <returns>Catalog of words/documents</returns>
        /// <exception cref="ArgumentNullException"><paramref name="startPageUris"/> is null.</exception>
        public Catalog BuildCatalog(Uri[] startPageUris)
        {
            // Reject a missing array up front; otherwise the current catalog would be
            // replaced by an empty one before the failure surfaces.
            if (startPageUris == null)
            {
                throw new ArgumentNullException("startPageUris");
            }

            _Catalog = new Catalog();
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (Uri Array) count: " + startPageUris.Length.ToString()));

            // Setup Stop, Go, Stemming
            SetPreferences();

            foreach (Uri startPageUri in startPageUris)
            {
                // Kept so fully qualified links can be compared against the start page.
                _CurrentStartUri = startPageUri;
                // AbsoluteUri already returns a string; the former ToString() was a no-op.
                // NOTE(review): ToLower() depends on the current culture; left unchanged
                // because the consumers of this field are outside this view.
                _CurrentStartUriString = _CurrentStartUri.AbsoluteUri.ToLower();

                ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (start Uri) " + startPageUri.AbsoluteUri));

                _Robot = new RobotsTxt(startPageUri, Preferences.RobotUserAgent);

                // The crawl of this start page begins here, with it as the 'root' document.
                ProcessUri(startPageUri, 0);

                ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog (end Uri) " + startPageUri.AbsoluteUri));
            }

            // All start pages have been spidered
            ProgressEvent(this, new ProgressEventArgs(1, "Spider.Catalog() complete."));
            ProgressEvent(this, new ProgressEventArgs(2, "Serializing to disk location " + Preferences.CatalogFileName));

            // Serialization of the Catalog, so we can load it again if the server Application is restarted
            _Catalog.Save();

            ProgressEvent(this, new ProgressEventArgs(3, "Save to disk " + Preferences.CatalogFileName + " successful"));

            // Done; return the catalog to the calling code.
            return _Catalog;
        }