/// <summary>
        ///     Inserts the crawled web page through the DAO (when inserting is enabled and the
        ///     request is storable) and, once the discovery has an ID, attaches the resulting
        ///     <see cref="ManagedWebPage"/> to the crawl request.
        /// </summary>
        /// <param name = "crawlRequest">The crawl request whose discovery is managed.</param>
        public override void ManageWebPage(CrawlRequest <TArachnodeDAO> crawlRequest)
        {
            bool shouldInsert = ApplicationSettings.InsertWebPages && crawlRequest.IsStorable;

            if (shouldInsert)
            {
                string absoluteUri = crawlRequest.Discovery.Uri.AbsoluteUri;

                //response headers are only persisted when configured; otherwise null is passed to the DAO.
                string responseHeaders = ApplicationSettings.InsertWebPageResponseHeaders
                                             ? crawlRequest.WebClient.HttpWebResponse.Headers.ToString()
                                             : null;

                //an empty array (not null) is stored when the page source is not to be persisted.
                var sourceToStore = ApplicationSettings.InsertWebPageSource
                                        ? crawlRequest.Data
                                        : new byte[] { };

                crawlRequest.Discovery.ID = _arachnodeDAO.InsertWebPage(
                    absoluteUri,
                    responseHeaders,
                    sourceToStore,
                    crawlRequest.Encoding.CodePage,
                    crawlRequest.DataType.FullTextIndexType,
                    crawlRequest.CurrentDepth,
                    ApplicationSettings.ClassifyAbsoluteUris);
            }

            //without an ID there is nothing to manage.
            if (!crawlRequest.Discovery.ID.HasValue)
            {
                return;
            }

            crawlRequest.ManagedDiscovery = ManageWebPage(
                crawlRequest.Discovery.ID.Value,
                crawlRequest.Discovery.Uri.AbsoluteUri,
                crawlRequest.Data,
                crawlRequest.Encoding,
                crawlRequest.DataType.FullTextIndexType,
                ApplicationSettings.ExtractWebPageMetaData,
                ApplicationSettings.InsertWebPageMetaData,
                ApplicationSettings.SaveDiscoveredWebPagesToDisk);
        }
        /// <summary>
        ///     Builds a <see cref="ManagedWebPage"/> from the raw bytes of a web page: optionally extracts
        ///     (and inserts) its meta data and optionally writes the decoded source to disk.
        /// </summary>
        /// <param name = "webPageID">The web page ID, used when inserting the meta data.</param>
        /// <param name = "absoluteUri">The absolute URI of the web page.</param>
        /// <param name = "source">The raw bytes of the web page.</param>
        /// <param name = "encoding">The encoding used to decode <paramref name = "source" /> and to write the page to disk.</param>
        /// <param name = "fullTextIndexType">The full text index type, used to resolve the discovery path.</param>
        /// <param name = "extractWebPageMetaData">if set to <c>true</c>, the HTML document, tags and text are extracted.</param>
        /// <param name = "insertWebPageMetaData">if set to <c>true</c> (and meta data is extracted), the meta data is inserted through the DAO.</param>
        /// <param name = "saveWebPageToDisk">if set to <c>true</c>, the decoded source is written to the discovery path.</param>
        /// <returns>
        ///     The managed web page, or <c>null</c> when any step throws (the exception is recorded through the DAO unless compiled with DEMO).
        ///     When the page is saved to disk the returned instance carries an open StreamWriter; it is not closed here.
        /// </returns>
        public override ManagedWebPage ManageWebPage(long webPageID, string absoluteUri, byte[] source, Encoding encoding, string fullTextIndexType, bool extractWebPageMetaData, bool insertWebPageMetaData, bool saveWebPageToDisk)
        {
            //declared outside the try so the failure path can release what was already opened.
            ManagedWebPage managedWebPage = null;

            try
            {
                managedWebPage = new ManagedWebPage();

                string decodedSource = null;

                //decoding is only needed when the text is consumed below.
                if (extractWebPageMetaData || saveWebPageToDisk)
                {
                    decodedSource = encoding.GetString(source);
                }

                if (extractWebPageMetaData)
                {
                    string htmlDecodedSource = HttpUtility.HtmlDecode(decodedSource);

                    //ANODET: Enable the HtmlAgilityPack to work with bytes.
                    managedWebPage.HtmlDocument = _htmlManager.CreateHtmlDocument(decodedSource, Encoding.Unicode);
                    managedWebPage.Tags         = UserDefinedFunctions.ExtractTags(htmlDecodedSource).Value;
                    managedWebPage.Text         = UserDefinedFunctions.ExtractText(htmlDecodedSource).Value;

                    #region Experimental Code comparing character parsing vs. regular expressions...

                    //bool inATag = false;

                    //StringBuilder stringBuilder = new StringBuilder();

                    //for (int i = 0; i < htmlDecodedSource.Length; i++)
                    //{
                    //    if(htmlDecodedSource[i] == '<')
                    //    {
                    //        inATag = true;
                    //        continue;
                    //    }

                    //    if (htmlDecodedSource[i] == '>')
                    //    {
                    //        inATag = false;
                    //        continue;
                    //    }

                    //    if (!inATag && !char.IsControl(htmlDecodedSource[i]))
                    //    {
                    //        stringBuilder.Append(htmlDecodedSource[i]);
                    //    }
                    //}

                    //managedWebPage.Text = stringBuilder.ToString();

                    #endregion

                    if (insertWebPageMetaData)
                    {
                        _arachnodeDAO.InsertWebPageMetaData(webPageID, absoluteUri, encoding.GetBytes(managedWebPage.Text), managedWebPage.HtmlDocument.DocumentNode.OuterHtml);
                    }
                }

                if (saveWebPageToDisk)
                {
                    managedWebPage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, absoluteUri, fullTextIndexType);

                    //the writer is intentionally left open on the returned instance.
                    //NOTE(review): presumably closed by whoever consumes the ManagedWebPage - confirm.
                    managedWebPage.StreamWriter = new StreamWriter(managedWebPage.DiscoveryPath, false, encoding);

                    managedWebPage.StreamWriter.Write(decodedSource);
                }

                return(managedWebPage);
            }
            catch (Exception exception)
            {
                //the partially built ManagedWebPage is discarded (null is returned), so nothing else
                //could ever close a writer that was already opened - release the file handle here.
                if (managedWebPage != null && managedWebPage.StreamWriter != null)
                {
                    try
                    {
                        managedWebPage.StreamWriter.Close();
                    }
                    catch (Exception)
                    {
                        //the original failure is reported below; a failing final flush must not
                        //turn this best-effort method into a throwing one.
                    }
                }

                //ANODET: Long paths...
#if !DEMO
                _arachnodeDAO.InsertException(absoluteUri, null, exception, false);
#endif
            }

            return(null);
        }
// Example #3
// 0
        /// <summary>
        ///     Indexes the discovery of a crawl request (File, Image or WebPage) in the Lucene.NET index.
        ///     A document previously indexed for the same discovery is deleted and re-added, keeping its
        ///     original "created" value and gaining an "updated" value.
        /// </summary>
        /// <remarks>
        ///     Nothing is indexed when the request is not storable, is disallowed, is not to be processed,
        ///     failed with a WebException, has no discovery ID, has no managed discovery, or carries no data.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO (not used by this implementation).</param>
        public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            if (!crawlRequest.IsStorable || crawlRequest.IsDisallowed || !crawlRequest.ProcessData || crawlRequest.WebClient.WebException != null || !crawlRequest.Discovery.ID.HasValue || crawlRequest.ManagedDiscovery == null)
            {
                return;
            }

            if (crawlRequest.Data == null)
            {
                return;
            }

            //check to see if we need to commit new documents to the searchable index...
            CommitIndexIfDue();

            //ANODET: Using an ID limits...
            QueryParser queryParser = new QueryParser("indexkey", _standardAnalyzer);

            //the index key is the first letter of the discovery type followed by the discovery ID.
            Query query = queryParser.Parse("\"" + crawlRequest.Discovery.DiscoveryType.ToString().ToLower().Substring(0, 1) + crawlRequest.Discovery.ID + "\"");

            Hits hits = _indexSearcher.Search(query);

            /**/

            float strength = (float)crawlRequest.Crawl.Crawler.ReportingManager.GetStrengthForHost(crawlRequest.Discovery.Uri.AbsoluteUri);

            //fall back to the host's priority when no strength is reported.
            if (strength == 0)
            {
                strength = (float)crawlRequest.Crawl.Crawler.ReportingManager.GetPriorityForHost(crawlRequest.Discovery.Uri.AbsoluteUri);
            }

            DiscoveryType discoveryType = crawlRequest.DataType.DiscoveryType;
            string        absoluteUri   = crawlRequest.Discovery.Uri.AbsoluteUri;

            //Files
            if (_indexFiles && discoveryType == DiscoveryType.File)
            {
                Document document = CreateTimestampedDocument(hits, "file");

                string contentToIndex;

                //the absolute URI is appended to the extracted text so that it is searchable as well.
                switch (crawlRequest.DataType.ContentType)
                {
                    case "application/msword":
                    case "application/vnd.ms-excel":
                    case "application/vnd.ms-powerpoint":
                        contentToIndex = _docManager.GetText(crawlRequest.ManagedDiscovery.DiscoveryPath) + " " + absoluteUri;
                        break;

                    case "application/pdf":
                        StringBuilder pdfText = new StringBuilder();

                        PdfReader pdfReader = new PdfReader(crawlRequest.Data);

                        //PDF pages are numbered from 1.
                        for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                        {
                            pdfText.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page) + " ");
                        }

                        contentToIndex = pdfText + " " + absoluteUri;
                        break;

                    default:
                        contentToIndex = Encoding.UTF8.GetString(crawlRequest.Data) + " " + absoluteUri;
                        break;
                }

                CreateDocument(document, crawlRequest.Discovery.ID.Value, discoveryType, absoluteUri, contentToIndex, Encoding.UTF8.CodePage, crawlRequest.DataType.FullTextIndexType, strength, crawlRequest.ManagedDiscovery.DiscoveryPath, crawlRequest.Crawl.CrawlInfo.ThreadNumber);

                return;
            }

            //Images
            if (_indexImages && discoveryType == DiscoveryType.Image)
            {
                Document document = CreateTimestampedDocument(hits, "image");

                //index the EXIF data together with the URI when it was extracted; otherwise the URI alone.
                string contentToIndex = _applicationSettings.ExtractImageMetaData
                                            ? ((ManagedImage)crawlRequest.ManagedDiscovery).EXIFData.InnerXml + " " + absoluteUri
                                            : absoluteUri;

                CreateDocument(document, crawlRequest.Discovery.ID.Value, discoveryType, absoluteUri, contentToIndex, Encoding.UTF8.CodePage, crawlRequest.DataType.FullTextIndexType, strength, crawlRequest.ManagedDiscovery.DiscoveryPath, crawlRequest.Crawl.CrawlInfo.ThreadNumber);

                return;
            }

            //WebPages
            if (_indexWebPages && discoveryType == DiscoveryType.WebPage)
            {
                Document document = CreateTimestampedDocument(hits, "webpage");

                CreateDocument(document, crawlRequest.Discovery.ID.Value, discoveryType, absoluteUri, crawlRequest.DecodedHtml, crawlRequest.Encoding.CodePage, crawlRequest.DataType.FullTextIndexType, strength, crawlRequest.ManagedDiscovery.DiscoveryPath, crawlRequest.Crawl.CrawlInfo.ThreadNumber);
            }
        }

        /// <summary>
        ///     Commits the index when auto-commit is enabled and at least the configured cache timeout has passed since the last commit.
        /// </summary>
        private void CommitIndexIfDue()
        {
            if (_autoCommit && DateTime.Now.Subtract(_lastCommitDateTime).Duration().TotalMinutes >= _webSettings.CacheTimeoutInMinutes)
            {
                //non-blocking: when another thread holds the lock this thread does not wait and skips the commit.
                if (Monitor.TryEnter(_autoCommitLock, 0))
                {
                    try
                    {
                        _indexWriter.Commit();

                        _lastCommitDateTime = DateTime.Now;
                    }
                    finally
                    {
                        //always release, otherwise a failing commit would leave the lock held by this thread.
                        Monitor.Exit(_autoCommitLock);
                    }
                }
            }
        }

        /// <summary>
        ///     Creates a new document carrying the "created"/"updated" fields and deletes the already indexed
        ///     document of the given discovery type, if one is present in the hits.
        /// </summary>
        /// <param name = "hits">The hits found for the discovery's index key.</param>
        /// <param name = "discoveryTypeName">The value of the "discoverytype" field to look for ("file", "image" or "webpage").</param>
        /// <returns>The new document, ready to receive the remaining fields.</returns>
        private Document CreateTimestampedDocument(Hits hits, string discoveryTypeName)
        {
            Document document = new Document();

            //look for an already indexed document of the requested type.
            Document existingDocument = null;

            for (int i = 0; i < hits.Length(); i++)
            {
                Document candidate = hits.Doc(i);

                if (candidate.GetField("discoverytype").StringValue() == discoveryTypeName)
                {
                    existingDocument = candidate;

                    break;
                }
            }

            if (existingDocument == null)
            {
                document.Add(new Field("created", DateTime.Now.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));
            }
            else
            {
                document.Add(new Field("created", existingDocument.GetField("created").StringValue(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.Add(new Field("updated", DateTime.Now.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));

                Term[] terms = new Term[1];

                terms[0] = new Term("indexkey", existingDocument.GetField("indexkey").StringValue());

                //we have to delete the existing document and then re-add it.  We can't "Update" documents in Lucene.net...
                _indexWriter.DeleteDocuments(terms);
            }

            return document;
        }

        /// <summary>
        ///     Fills the document with the fields describing a discovery (keys, URI, text, title,
        ///     discovery path and URI classification) and passes it on to <see cref = "AddDocument" />.
        /// </summary>
        /// <param name = "document">The document to populate.</param>
        /// <param name = "discoveryID">The discovery ID.</param>
        /// <param name = "discoveryType">The discovery type; its lower-cased name is stored and its first letter prefixes the index key.</param>
        /// <param name = "absoluteUri">The absolute URI.</param>
        /// <param name = "contentToIndex">The content to index; also the input from which the title is matched.</param>
        /// <param name = "codePage">The code page.</param>
        /// <param name = "fullTextIndexType">The full text index type.</param>
        /// <param name = "strength">The strength, forwarded to <see cref = "AddDocument" />.</param>
        /// <param name = "discoveryPath">The discovery path (stored, not indexed).</param>
        /// <param name = "threadNumber">The thread number (not used by this implementation).</param>
        protected virtual void CreateDocument(Document document, long discoveryID, DiscoveryType discoveryType, string absoluteUri, string contentToIndex, int codePage, string fullTextIndexType, float strength, string discoveryPath, int threadNumber)
        {
            //shorthands for the store/index options used below.
            Field.Store stored      = Field.Store.YES;
            Field.Index notAnalyzed = Field.Index.NOT_ANALYZED;
            Field.Index analyzed    = Field.Index.ANALYZED;

            string discoveryTypeName = discoveryType.ToString().ToLower();

            //keys: first letter of the type + ID, e.g. the same value PerformAction-style lookups query by "indexkey".
            string indexKey = discoveryTypeName.Substring(0, 1) + discoveryID;

            document.Add(new Field("indexkey", indexKey, stored, notAnalyzed));

            document.Add(new Field("discoveryid", discoveryID.ToString(), stored, notAnalyzed));
            document.Add(new Field("discoverytype", discoveryTypeName, stored, notAnalyzed));

            //Discovery
            document.Add(new Field("absoluteuri", absoluteUri, stored, analyzed));

            //core fields - the text itself is searchable but not stored.
            document.Add(new Field("text", contentToIndex, Field.Store.NO, analyzed));
            document.Add(new Field("codepage", codePage.ToString(), stored, notAnalyzed));

            string title = _title.Match(contentToIndex).Groups["Title"].Value.Trim();

            document.Add(new Field("title", title, stored, analyzed));

            //DiscoveryPath
            document.Add(new Field("discoverypath", discoveryPath, stored, Field.Index.NO));

            //AbsoluteUri Classification
            document.Add(new Field("domain", UserDefinedFunctions.ExtractDomain(absoluteUri).Value, stored, notAnalyzed));
            document.Add(new Field("extension", UserDefinedFunctions.ExtractExtension(absoluteUri, false).Value, stored, notAnalyzed));
            document.Add(new Field("host", UserDefinedFunctions.ExtractHost(absoluteUri).Value, stored, notAnalyzed));
            document.Add(new Field("scheme", UserDefinedFunctions.ExtractScheme(absoluteUri, false).Value, stored, notAnalyzed));

            //FullTextIndexType - used to store the extension that can be used with the default IIS MIME types configuration... (.pl images cannot be served without MIME type modification...)
            document.Add(new Field("fulltextindextype", fullTextIndexType, stored, notAnalyzed));

            AddDocument(document, absoluteUri, strength);
        }

        /// <summary>
        ///     Stores the raw strength on the document, derives the boosts from it and adds the document to the index.
        /// </summary>
        /// <remarks>
        ///     The strength is divided by <c>log(slashes - 2) + 1</c>, where <c>slashes</c> is the number of '/'
        ///     characters in the URI, and then clamped to the range 1 to 1000000.
        /// </remarks>
        /// <param name = "document">The document to boost and add.</param>
        /// <param name = "absoluteUri">The absolute URI; only its number of '/' characters is used.</param>
        /// <param name = "strength">The strength the boosts are derived from; stored unmodified in the "strength" field.</param>
        protected void AddDocument(Document document, string absoluteUri, float strength)
        {
            document.Add(new Field("strength", strength.ToString(), Field.Store.YES, Field.Index.NO));

            int slashCount = absoluteUri.Length - absoluteUri.Replace("/", string.Empty).Length;

            //NOTE(review): the "- 2" presumably discounts the two slashes of the scheme separator - confirm.
            strength /= ((float)Math.Log(slashCount - 2) + 1);

            //Math.Log yields NaN for a negative argument (fewer than two slashes).  NaN fails every
            //comparison, so without the explicit check it would slip through both clamps and end up as a boost.
            //Exactly two slashes yields -Infinity and hence a strength below 1, so both cases end at the minimum.
            if (float.IsNaN(strength) || strength < 1)
            {
                strength = 1;
            }

            if (strength > 1000000)
            {
                strength = 1000000;
            }

            //Set Fields Boosts.
            Field field = document.GetField("absoluteuri");

            if (field != null)
            {
                field.SetBoost(4 * strength);
            }

            field = document.GetField("text");

            if (field != null)
            {
                field.SetBoost(1 * strength);
            }

            field = document.GetField("host");

            if (field != null)
            {
                field.SetBoost(2 * strength);
            }

            field = document.GetField("title");

            if (field != null)
            {
                field.SetBoost(3 * strength);
            }

            document.SetBoost(strength + 1);

            _indexWriter.AddDocument(document);
        }

        /// <summary>
        ///     Commits, optimizes and closes the index writer, reporting each step on the console.
        ///     The writer is closed even when committing or optimizing throws.
        /// </summary>
        private void TearDownIndexWriter()
        {
            ConsoleManager <TArachnodeDAO> consoleManager = new ConsoleManager <TArachnodeDAO>(_applicationSettings, _webSettings);

            try
            {
                _indexWriter.Commit();

                consoleManager.OutputString("\tCommitting LuceneDotNetIndex.", ConsoleColor.White, ConsoleColor.Gray);

                _indexWriter.Optimize();

                consoleManager.OutputString("\tOptimizing LuceneDotNetIndex.", ConsoleColor.White, ConsoleColor.Gray);
            }
            finally
            {
                //closing in a finally block keeps a failed commit/optimize from leaving the writer open.
                _indexWriter.Close();

                consoleManager.OutputString("\tClosing LuceneDotNetIndex.", ConsoleColor.White, ConsoleColor.Gray);
            }
        }

        /// <summary>
        ///     Stops this instance: announces that the current crawl is being saved, tears down the
        ///     index writer, manages the indexes and tears the index writer down a second time.
        /// </summary>
        public override void Stop()
        {
            new ConsoleManager <TArachnodeDAO>(_applicationSettings, _webSettings).OutputString("Saving CurrentCrawl.", ConsoleColor.Gray, ConsoleColor.Gray);

            //flush and close the writer used during the crawl.
            TearDownIndexWriter();

            //NOTE(review): presumably leaves a fresh index writer behind, which is why a second tear down follows - confirm.
            ManageIndexes();

            TearDownIndexWriter();
        }

        private void RebuildIndexes(string tempDirectory, long fileIDLowerBound, long fileIDUpperBound, long imageIDLowerBound, long imageIDUpperBound, long webPageIDLowerBound, long webPageIDUpperBound)
        {
            if (!Directory.Exists(tempDirectory))
            {
                Directory.CreateDirectory(tempDirectory);
            }

            _indexTempDirectory = FSDirectory.Open(new DirectoryInfo(tempDirectory));

            _indexWriter = new IndexWriter(_indexTempDirectory, _standardAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);

            SetIndexWriterDefaults();

            TArachnodeDAO arachnodeDAO = (TArachnodeDAO)Activator.CreateInstance(typeof(TArachnodeDAO), _applicationSettings.ConnectionString);

            arachnodeDAO.ApplicationSettings = _applicationSettings;
            arachnodeDAO.WebSettings         = _webSettings;

            ConsoleManager <TArachnodeDAO>   consoleManager   = new ConsoleManager <TArachnodeDAO>(_applicationSettings, _webSettings);
            ActionManager <TArachnodeDAO>    actionManager    = new ActionManager <TArachnodeDAO>(_applicationSettings, _webSettings, consoleManager);
            MemoryManager <TArachnodeDAO>    memoryManager    = new MemoryManager <TArachnodeDAO>(_applicationSettings, _webSettings);
            ReportingManager <TArachnodeDAO> reportingManager = new ReportingManager <TArachnodeDAO>(_applicationSettings, _webSettings, consoleManager);
            RuleManager <TArachnodeDAO>      ruleManager      = new RuleManager <TArachnodeDAO>(_applicationSettings, _webSettings, consoleManager);
            CacheManager <TArachnodeDAO>     cacheManager     = new CacheManager <TArachnodeDAO>(_applicationSettings, _webSettings);
            Cache <TArachnodeDAO>            cache            = new Cache <TArachnodeDAO>(_applicationSettings, _webSettings, null, actionManager, cacheManager, null, memoryManager, ruleManager);
            DiscoveryManager <TArachnodeDAO> discoveryManager = new DiscoveryManager <TArachnodeDAO>(_applicationSettings, _webSettings, cache, actionManager, cacheManager, memoryManager, ruleManager);
            HtmlManager <TArachnodeDAO>      htmlManager      = new HtmlManager <TArachnodeDAO>(_applicationSettings, _webSettings, discoveryManager);

            //Files
            if (_indexFiles)
            {
                FileManager <TArachnodeDAO> fileManager = new FileManager <TArachnodeDAO>(_applicationSettings, _webSettings, discoveryManager, arachnodeDAO);

                for (long i = fileIDLowerBound; i <= fileIDUpperBound; i++)
                {
                    try
                    {
                        //get the File from the database.  we need the source data as we don't store this in the index.
                        //even though most of the fields are available in the Document, the File is the authoritative source, so we'll use that for all of the fields.
                        ArachnodeDataSet.FilesRow filesRow = arachnodeDAO.GetFile(i.ToString());

                        if (filesRow != null)
                        {
                            if (filesRow.Source == null || filesRow.Source.Length == 0)
                            {
                                filesRow.Source = File.ReadAllBytes(discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedFilesDirectory, filesRow.AbsoluteUri, filesRow.FullTextIndexType));
                            }

                            /**/

                            float strength = (float)reportingManager.GetStrengthForHost(filesRow.AbsoluteUri);

                            if (strength == 0)
                            {
                                strength = (float)reportingManager.GetPriorityForHost(filesRow.AbsoluteUri);
                            }

                            /**/

                            //manage the File to update the discovery path if needed and to ensure the File is saved to disk.
                            ManagedFile managedFile = fileManager.ManageFile(null, filesRow.ID, filesRow.AbsoluteUri, filesRow.Source, filesRow.FullTextIndexType, _applicationSettings.ExtractFileMetaData, _applicationSettings.InsertFileMetaData, _applicationSettings.SaveDiscoveredFilesToDisk);

                            Document document = new Document();

                            document.Add(new Field("created", DateTime.Now.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            document.Add(new Field("updated", DateTime.Now.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));

                            switch (UserDefinedFunctions.ExtractResponseHeader(filesRow.ResponseHeaders, "Content-Type:", false).Value)
                            {
                            case "application/msword":
                            case "application/vnd.ms-excel":
                            case "application/vnd.ms-powerpoint":
                                CreateDocument(document, filesRow.ID, DiscoveryType.File, filesRow.AbsoluteUri, _docManager.GetText(managedFile.DiscoveryPath) + " " + filesRow.AbsoluteUri, Encoding.UTF8.CodePage, filesRow.FullTextIndexType, strength, managedFile.DiscoveryPath, 1);
                                break;

                            case "application/pdf":
                                StringBuilder stringBuilder = new StringBuilder();

                                PdfReader pdfReader = new PdfReader(filesRow.Source);

                                for (int page = 1; page <= pdfReader.NumberOfPages; page++)
                                {
                                    stringBuilder.Append(_pdfManager.ExtractText(pdfReader.GetPageContent(page)) + " ");
                                }

                                CreateDocument(document, filesRow.ID, DiscoveryType.File, filesRow.AbsoluteUri, stringBuilder + " " + filesRow.AbsoluteUri, Encoding.UTF8.CodePage, filesRow.FullTextIndexType, strength, managedFile.DiscoveryPath, 1);
                                break;

                            default:
                                CreateDocument(document, filesRow.ID, DiscoveryType.File, filesRow.AbsoluteUri, Encoding.UTF8.GetString(filesRow.Source) + " " + filesRow.AbsoluteUri, Encoding.UTF8.CodePage, filesRow.FullTextIndexType, strength, managedFile.DiscoveryPath, 1);
                                break;
                            }

                            Console.WriteLine("File: '" + filesRow.AbsoluteUri + "' indexed. (" + i + " of " + (fileIDUpperBound - fileIDLowerBound + 1) + ")");
                        }
                        else
                        {
                            Console.WriteLine("File: " + i + " was not found. (" + i + " of " + (fileIDUpperBound - fileIDLowerBound + 1) + ")");
                        }
                    }
                    catch (Exception exception)
                    {
                        Console.WriteLine("File: " + i + " was not indexed. (" + i + " of " + (fileIDUpperBound - fileIDLowerBound + 1) + ")");

                        arachnodeDAO.InsertException(null, null, exception, false);
                    }
                }
            }

            //Images
            if (_indexImages)
            {
                ImageManager <TArachnodeDAO> imageManager = new ImageManager <TArachnodeDAO>(_applicationSettings, _webSettings, discoveryManager, arachnodeDAO);

                for (long i = imageIDLowerBound; i <= imageIDUpperBound; i++)
                {
                    try
                    {
                        //get the Image from the database.  we need the source data as we don't store this in the index.
                        //even though most of the fields are available in the Document, the Image is the authoritative source, so we'll use that for all of the fields.
                        ArachnodeDataSet.ImagesRow imagesRow = arachnodeDAO.GetImage(i.ToString());

                        if (imagesRow != null)
                        {
                            if (imagesRow.Source == null || imagesRow.Source.Length == 0)
                            {
                                imagesRow.Source = File.ReadAllBytes(discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedImagesDirectory, imagesRow.AbsoluteUri, imagesRow.FullTextIndexType));
                            }

                            /**/

                            float strength = (float)reportingManager.GetStrengthForHost(imagesRow.AbsoluteUri);

                            if (strength == 0)
                            {
                                strength = (float)reportingManager.GetPriorityForHost(imagesRow.AbsoluteUri);
                            }

                            /**/

                            //manage the Image to update the discovery path if needed and to ensure the Image is saved to disk.
                            ManagedImage managedImage = imageManager.ManageImage(null, imagesRow.ID, imagesRow.AbsoluteUri, imagesRow.Source, imagesRow.FullTextIndexType, _applicationSettings.ExtractImageMetaData, _applicationSettings.InsertImageMetaData, _applicationSettings.SaveDiscoveredImagesToDisk);

                            ArachnodeDataSet.ImagesMetaDataRow imagesMetaDataRow = arachnodeDAO.GetImageMetaData(imagesRow.ID);

                            Document document = new Document();
                            //ANODET: Remove the UriClassification for Files, Images and WebPages... add the alt tags!

                            document.Add(new Field("created", DateTime.Now.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            document.Add(new Field("updated", DateTime.Now.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));

                            if (imagesMetaDataRow != null)
                            {
                                CreateDocument(document, imagesRow.ID, DiscoveryType.Image, imagesRow.AbsoluteUri, imagesMetaDataRow.EXIFData + " " + imagesRow.AbsoluteUri, Encoding.UTF8.CodePage, imagesRow.FullTextIndexType, strength, managedImage.DiscoveryPath, 1);
                            }
                            else
                            {
                                CreateDocument(document, imagesRow.ID, DiscoveryType.Image, imagesRow.AbsoluteUri, imagesRow.AbsoluteUri, Encoding.UTF8.CodePage, imagesRow.FullTextIndexType, strength, managedImage.DiscoveryPath, 1);
                            }

                            Console.WriteLine("Image: '" + imagesRow.AbsoluteUri + "' indexed. (" + i + " of " + (imageIDUpperBound - imageIDLowerBound + 1) + ")");
                        }
                        else
                        {
                            Console.WriteLine("Image: " + i + " was not found. (" + i + " of " + (imageIDUpperBound - imageIDLowerBound + 1) + ")");
                        }
                    }
                    catch (Exception exception)
                    {
                        Console.WriteLine("Image: " + i + " was not indexed. (" + i + " of " + (imageIDUpperBound - imageIDLowerBound + 1) + ")");

                        arachnodeDAO.InsertException(null, null, exception, false);
                    }
                }
            }

            //WebPages
            if (_indexWebPages)
            {
                WebPageManager <TArachnodeDAO> webPageManager = new WebPageManager <TArachnodeDAO>(_applicationSettings, _webSettings, discoveryManager, htmlManager, arachnodeDAO);

                for (long i = webPageIDLowerBound; i <= webPageIDUpperBound; i++)
                {
                    try
                    {
                        //get the WebPage from the database.  we need the source data as we don't store this in the index.
                        //even though most of the fields are available in the Document, the WebPage is the authoritative source, so we'll use that for all of the fields.
                        ArachnodeDataSet.WebPagesRow webPagesRow = arachnodeDAO.GetWebPage(i.ToString());

                        if (webPagesRow != null)
                        {
                            Encoding encoding = Encoding.GetEncoding(webPagesRow.CodePage);

                            if (webPagesRow.Source == null || webPagesRow.Source.Length == 0)
                            {
                                using (StreamReader streamReader = File.OpenText(discoveryManager.GetDiscoveryPath(_applicationSettings.DownloadedWebPagesDirectory, webPagesRow.AbsoluteUri, webPagesRow.FullTextIndexType)))
                                {
                                    webPagesRow.Source = encoding.GetBytes(streamReader.ReadToEnd());
                                }
                            }

                            /**/

                            float strength = (float)reportingManager.GetStrengthForHost(webPagesRow.AbsoluteUri);

                            if (strength == 0)
                            {
                                strength = (float)reportingManager.GetPriorityForHost(webPagesRow.AbsoluteUri);
                            }

                            /**/

                            //manage the WebPage to update the discovery path if needed and to ensure the WebPage is saved to disk.
                            ManagedWebPage managedWebPage = webPageManager.ManageWebPage(webPagesRow.ID, webPagesRow.AbsoluteUri, webPagesRow.Source, encoding, webPagesRow.FullTextIndexType, _applicationSettings.ExtractWebPageMetaData, _applicationSettings.InsertWebPageMetaData, _applicationSettings.SaveDiscoveredWebPagesToDisk);

                            Document document = new Document();

                            document.Add(new Field("created", webPagesRow.InitiallyDiscovered.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));

                            if (!webPagesRow.IsLastModifiedNull())
                            {
                                document.Add(new Field("updated", webPagesRow.LastModified.Date.ToString("yyyyMMdd"), Field.Store.YES, Field.Index.NOT_ANALYZED));
                            }

                            CreateDocument(document, webPagesRow.ID, DiscoveryType.WebPage, webPagesRow.AbsoluteUri, encoding.GetString(webPagesRow.Source), encoding.CodePage, webPagesRow.FullTextIndexType, strength, managedWebPage.DiscoveryPath, 1);

                            Console.WriteLine("WebPage: '" + webPagesRow.AbsoluteUri + "' indexed. (" + i + " of " + (webPageIDUpperBound - webPageIDLowerBound + 1) + ")");
                        }
                        else
                        {
                            Console.WriteLine("WebPageID: " + i + " was not found. (" + i + " of " + (webPageIDUpperBound - webPageIDLowerBound + 1) + ")");
                        }
                    }
示例#4
0
        /// <summary>
        ///     Performs the action.  For a WebPage discovery that carries data, scores the XPaths of the page's
        ///     HTML document, tallies the text found under the selected XPaths and shows the text of the
        ///     highest-scoring XPath(s) in a message box.
        /// </summary>
        /// <remarks>
        ///     Returns without showing anything when the request is not a WebPage, has no data, has no managed
        ///     discovery, or when no non-blank text could be extracted.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.  Not used by this action.</param>
        public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //use this instead: http://code.google.com/p/boilerpipe/

            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage || crawlRequest.Data == null)
            {
                return;
            }

            ManagedWebPage managedWebPage = (ManagedWebPage)crawlRequest.ManagedDiscovery;

            if (managedWebPage == null)
            {
                //the discovery was never managed, so there is no document to analyze.
                return;
            }

            if (managedWebPage.HtmlDocument == null)
            {
                managedWebPage.HtmlDocument = crawlRequest.Crawl.Crawler.HtmlManager.CreateHtmlDocument(crawlRequest.Html, Encoding.Unicode);
            }

            HtmlNode documentNode = managedWebPage.HtmlDocument.DocumentNode;

            IDictionary <string, XPathInfo> xPathInfos = GenerateXPaths(documentNode, string.Empty, new Dictionary <string, XPathInfo>());

            ProcessXPaths(xPathInfos);

            //walk the XPaths by descending LevenstheinDistance and keep each one that is deeper (contains more '/')
            //than every XPath kept so far.  stop once more than 'minimumNumberOfXPaths' + 1 XPaths were passed over.
            IDictionary <string, XPathInfo> candidateXPathInfos = new Dictionary <string, XPathInfo>();

            const int minimumNumberOfXPaths = 5;
            int deepestNumberOfSlashes = 0;
            int passedOverXPaths = 0;

            foreach (XPathInfo xPathInfo in xPathInfos.Values.OrderByDescending(x => x.LevenstheinDistance))
            {
                int numberOfSlashes = xPathInfo.XPath.Count(c => c == '/');

                if (numberOfSlashes > deepestNumberOfSlashes)
                {
                    deepestNumberOfSlashes = numberOfSlashes;

                    candidateXPathInfos.Add(xPathInfo.XPath, xPathInfo);
                }
                else if (passedOverXPaths++ > minimumNumberOfXPaths)
                {
                    break;
                }
            }

            //tally how often each distinct extracted text occurs, remembering the first XPath that produced it.
            Dictionary <string, XPathInfo> textTallies = new Dictionary <string, XPathInfo>();

            foreach (XPathInfo xPathInfo in candidateXPathInfos.Values)
            {
                var htmlNodes = documentNode.SelectNodes(xPathInfo.XPath);

                if (htmlNodes == null)
                {
                    //SelectNodes returns null (not an empty collection) when nothing matches.
                    continue;
                }

                foreach (HtmlNode htmlNode in htmlNodes)
                {
                    string text = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

                    XPathInfo textTally;

                    if (!textTallies.TryGetValue(text, out textTally))
                    {
                        textTally = new XPathInfo();
                        textTally.XPath = xPathInfo.XPath;

                        textTallies.Add(text, textTally);
                    }

                    textTally.Count++;
                }
            }

            //only non-blank texts take part in the scoring.
            Dictionary <string, XPathInfo> nonBlankTallies = new Dictionary <string, XPathInfo>();

            foreach (KeyValuePair <string, XPathInfo> keyValuePair in textTallies)
            {
                if (!string.IsNullOrEmpty(keyValuePair.Key.Trim()))
                {
                    nonBlankTallies.Add(keyValuePair.Key, keyValuePair.Value);
                }
            }

            if (nonBlankTallies.Count == 0)
            {
                //nothing was extracted; Max() below would throw on an empty sequence.
                return;
            }

            //reward texts that contain, or are contained in, other texts.  every ordered pair is visited
            //(including a text paired with itself), exactly as the scoring has always worked.
            foreach (string key in nonBlankTallies.Keys)
            {
                foreach (string key2 in nonBlankTallies.Keys)
                {
                    if (key.Contains(key2) || key2.Contains(key))
                    {
                        nonBlankTallies[key].Count++;
                        nonBlankTallies[key2].Count++;
                    }
                }
            }

            int highestCount = nonBlankTallies.Max(d => d.Value.Count);

            StringBuilder stringBuilder = new StringBuilder();

            foreach (KeyValuePair <string, XPathInfo> keyValuePair in nonBlankTallies)
            {
                if (keyValuePair.Value.Count != highestCount)
                {
                    continue;
                }

                var htmlNodes = documentNode.SelectNodes(keyValuePair.Value.XPath);

                if (htmlNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode htmlNode in htmlNodes)
                {
                    stringBuilder.Append(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);
                }
            }

            MessageBox.Show(stringBuilder.ToString());

            //return stringBuilder.ToString();
        }
示例#5
0
 /// <summary>
 ///     Builds the complete HTML document for a page: head, navigation bar, page content,
 ///     sidebar and footer, concatenated in that order.
 /// </summary>
 /// <param name="page">The page whose title, URL, header HTML, last-updated text and content are embedded.</param>
 /// <returns>The full HTML markup as a single string.</returns>
 public static string GetFullPageHTML(ManagedWebPage page)
 {
     Debug.Out($"Generating full HTML for \"{page.Title}\" ({page.RelativeURL})", "HTML BUILDER");

     // Segments are built top-to-bottom so every property and helper is read in document order.

     // Doctype and <head>: builder signature, title, shared "header" module, page-specific header HTML.
     string headHtml = $"<!DOCTYPE html>\n<head>\n{GlobalString.HTML_BUILDER_SIGNATURE}\n<title>{page.Title} - egartley.net</title>\n{CoreManager.GetModuleByTag("header").Code}\n{page.AdditionalHeaderHTML}\n</head>\n";

     // Opens <body> and the root container, then the navigation bar for the lower-cased relative URL.
     string navigationHtml = $"<body><div class=\"root-container\"><div class=\"navigation-bar-container\">{GetNavigationBarHTML(page.RelativeURL.ToLower())}</div>";

     // Content card: title, spacer, "Last updated" line and the page's own content.
     string contentHtml = $"<div class=\"content-container\"><div class=\"page-container content-card\"><div class=\"page-content-container\"><span class=\"page-title\">{page.Title}</span><span class=\"gradient-spacer\"></span><span class=\"page-meta\">Last updated {page.GetLastUpdatedAsString()}</span><div class=\"content\">{page.ContentHTML}</div></div></div>";

     // Sidebar card (shared "sidebar" module); also closes the content container.
     string sidebarHtml = $"<div class=\"sidebar-container content-card\">{CoreManager.GetModuleByTag("sidebar").Code}</div></div>";

     // Footer (shared "footer" module); closes the root container, <body> and the document.
     string footerHtml = $"<div class=\"footer-container\">{CoreManager.GetModuleByTag("footer").Code}</div></div></body></html>";

     return headHtml + navigationHtml + contentHtml + sidebarHtml + footerHtml;
 }