Example #1
0
        /// <summary>
        /// Checks whether the index already holds a document whose "filename" and
        /// "LastModified" terms both match the given file.
        /// </summary>
        /// <param name="fileInfo">File whose filename and last-modified time are looked up.</param>
        /// <returns>
        /// true when at least one matching document is found; false when nothing matches
        /// or when the lookup throws (the exception message is written to the console).
        /// </returns>
        private bool isInIndex(IndexableFileInfo fileInfo)
        {
            bool found = false;
            IndexSearcher indexSearcher = new IndexSearcher(this.luceneIndexDir);

            try
            {
                // Both terms are required: same file name AND same modification time (second resolution).
                Term filenameTerm = new Term("filename", fileInfo.Filename);
                string modifiedStamp = DateTools.DateToString(fileInfo.LastModified, DateTools.Resolution.SECOND);
                Term modifiedTerm = new Term("LastModified", modifiedStamp);

                BooleanQuery lookup = new BooleanQuery();
                lookup.Add(new TermQuery(filenameTerm), BooleanClause.Occur.MUST);
                lookup.Add(new TermQuery(modifiedTerm), BooleanClause.Occur.MUST);

                Hits matches = indexSearcher.Search(lookup);
                found = matches.Length() > 0;
            }
            catch (Exception ex)
            {
                // A failed lookup is reported and treated as "not in index".
                Console.Write(ex.Message);
            }
            finally
            {
                indexSearcher.Close();
            }

            return found;
        }
Example #2
0
        /// <summary>
        /// Adds the given files to the Lucene index and then rebuilds the spell-checker index.
        /// Returns immediately, doing nothing, when the <c>indexing</c> flag shows a run is
        /// already in progress.
        /// </summary>
        /// <param name="LuceneIndexDir">Directory of the Lucene index to write to.</param>
        /// <param name="SpellingIndexDir">Directory passed on to the spell-checker indexing step.</param>
        /// <param name="indexCreationMode">
        /// Mode handed to the indexer; when it is AppendToExistingIndex, duplicate and deleted
        /// files are removed from the index after the files have been added.
        /// </param>
        /// <param name="fileInfos">Files to index, processed in array order.</param>
        /// <param name="ThreadState">Not used by this method.</param>
        /// <param name="AddFileToIndex">
        /// Optional callback invoked before each file is added, with the file and the
        /// fraction of files processed so far (index / total). May be null.
        /// </param>
        public static void doIndex(string LuceneIndexDir, string SpellingIndexDir, IndexCreationMode indexCreationMode, IndexableFileInfo[] fileInfos, object ThreadState, onAddFileToIndex AddFileToIndex)
        {
            if (indexing)
            {
                return;
            }
            lock (padlock)
            {
                indexing = true;
                try
                {
                    LuceneIndexer indexer = new LuceneIndexer(LuceneIndexDir, indexCreationMode); // create new index
                    try
                    {
                        for (int i = 0; i < fileInfos.Length; i++)
                        {
                            IndexableFileInfo fi = fileInfos[i];
                            if (AddFileToIndex != null)
                            {
                                AddFileToIndex(fi, (double)i / (double)fileInfos.Length);
                            }

                            indexer.addFileInfoToIndex(fi);
                        } // for
                    }
                    finally
                    {
                        // Always release the index writer, even when adding a file failed.
                        indexer.CloseIndexWriter(OptimizeMode.DoNotOptimize);
                    }

                    if (indexCreationMode == IndexCreationMode.AppendToExistingIndex)
                    {
                        removeAllDuplicateAndDeletedFiles(fileInfos, LuceneIndexDir, indexCreationMode);
                    }

                    try
                    {
                        doSpellCheckerIndexing(LuceneIndexDir, SpellingIndexDir);
                    }
                    catch
                    {
                        // Best effort: a failure while building the spelling index is
                        // deliberately ignored so it cannot fail the main indexing run.
                    }
                }
                finally
                {
                    // Reset the flag on every exit path. Without this, one exception above
                    // would leave it set and every later call would return without indexing.
                    indexing = false;
                }
            }
        } // doIndex
Example #3
0
        /// <summary>
        /// Builds a Lucene document from the given file information and adds it to the index writer.
        /// In AppendToExistingIndex mode a file that <c>isInIndex</c> reports as already present is skipped.
        /// </summary>
        /// <param name="fileInfo">
        /// File to index. When the file exists on disk and <c>Contents</c> is null or empty, the
        /// contents are loaded via <c>IFilterFileContents.getFileContents</c> and stored back on this object.
        /// </param>
        public void addFileInfoToIndex(IndexableFileInfo fileInfo)
        {
            if ((_indexCreationMode == IndexCreationMode.AppendToExistingIndex) && isInIndex(fileInfo))
            {
                return;
            }

            bool fileExistsOnDisk = System.IO.File.Exists(fileInfo.Filename);

            // Null is treated the same as empty: in both cases there is nothing to index yet.
            if (fileExistsOnDisk && String.IsNullOrEmpty(fileInfo.Contents))
            {
                fileInfo.Contents = IFilterFileContents.getFileContents(fileInfo.Filename);
            }

            Document doc = new Document();

            /* From http://www.webreference.com/programming/lucene/2/
             * Field.Keyword Isn't analyzed, but is indexed and stored in the index verbatim. This type is suitable for fields whose original value should be preserved in its entirety, such as URLs, file system paths, dates, personal names, Social Security numbers, telephone numbers, and so on. For example, we used the file system path in Indexer (listing 1.1) as a Keyword field.
             * Field.UnIndexed Is neither analyzed nor indexed, but its value is stored in the index as is. This type is suitable for fields that you need to display with search results (such as a URL or database primary key), but whose values you'll never search directly. Since the original value of a field of this type is stored in the index, this type isn't suitable for storing fields with very large values, if index size is an issue.
             * Field.UnStored The opposite of UnIndexed. This field type is analyzed and indexed but isn't stored in the index. It's suitable for indexing a large amount of text that doesn't need to be retrieved in its original form, such as bodies of web pages, or any other type of text document
             * Field.Text Is analyzed, and is indexed. This implies that fields of this type can be searched against, but be cautious about the field size. If the data indexed is a String, it's also stored; but if the data (as in our Indexer example) is from a Reader, it isn't stored. This is often a source of confusion, so take note of this difference when using Field.Text.
             */

            // -- add fields to the document
            doc.Add(new Field("contents", fileInfo.Contents, Field.Store.YES, Field.Index.TOKENIZED));          // can be searched and is analyzed
            doc.Add(new Field("filename", fileInfo.Filename, Field.Store.YES, Field.Index.UN_TOKENIZED));       // can be searched, but is not analyzed
            doc.Add(new Field("filenameParams", fileInfo.FilenameParameters, Field.Store.YES, Field.Index.NO)); // can not be searched
            doc.Add(new Field("contentIsPageSummary", Convert.ToString(fileInfo.ContentIsPageSummary), Field.Store.YES, Field.Index.NO));

            doc.Add(new Field("SectionName", fileInfo.SectionName, Field.Store.YES, Field.Index.UN_TOKENIZED));

            // Same name, format and resolution that isInIndex uses for its lookup term.
            doc.Add(new Field("LastModified", DateTools.DateToString(fileInfo.LastModified, DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.UN_TOKENIZED));

            Field titleField = new Field("title", fileInfo.Title, Field.Store.YES, Field.Index.TOKENIZED);

            titleField.SetBoost(TitleFieldBoost); // default value is 1.0
            doc.Add(titleField);

            if (fileExistsOnDisk)
            {
                System.IO.DirectoryInfo di = new System.IO.FileInfo(fileInfo.Filename).Directory;
                doc.Add(new Field("directparentdirectory", di.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));

                // One "parentdirectory" value per ancestor, from the containing directory up to the root.
                while (di != null)
                {
                    doc.Add(new Field("parentdirectory", di.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
                    di = di.Parent;
                }

                // The extension is an exact-match index term, so normalise it without the
                // current culture (culture-sensitive lowercasing changes letters such as 'I'
                // under some cultures).
                string ext = System.IO.Path.GetExtension(fileInfo.Filename).ToLowerInvariant();
                if (ext.StartsWith(".", StringComparison.Ordinal))
                {
                    ext = ext.Substring(1);
                }
                doc.Add(new Field("filetype", ext, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            doc.Add(new Field("dateIndexed", DateTools.DateToString(DateTime.Now, DateTools.Resolution.SECOND), Field.Store.YES, Field.Index.NO));
            // -- add the document to the index
            writer.AddDocument(doc);
        } // addFileInfoToIndex
Example #4
0
        } // constructor

        /// <summary>
        /// Searches the "title" and "contents" fields of the index using the keywordQuery.
        ///
        /// See http://www.dotlucene.net/documentation/QuerySyntax.html  for the format of the keywordQuery.
        ///
        /// This function will return a fully-filled array of IndexableFileInfo objects, in
        /// relevance order. For hits that are not page summaries the returned text is a
        /// highlighted fragment (matches wrapped in &lt;strong&gt; tags) instead of the full contents.
        /// </summary>
        /// <param name="keywordQuery">
        /// Query text to search for; all terms are required (AND is the default operator).
        /// A null or empty query yields an empty result without opening the index.
        /// </param>
        /// <param name="queryForHighlighter">
        /// Query text whose terms are highlighted in the returned fragments. May be null or
        /// empty, in which case a placeholder query is used.
        /// </param>
        /// <returns>
        /// The matching files, or an empty array when the query is empty or the index cannot be opened.
        /// </returns>
        public IndexableFileInfo[] doSearch(string keywordQuery, string queryForHighlighter)
        {
            // Nothing to search for: answer before touching the index at all.
            if (String.IsNullOrEmpty(keywordQuery))
            {
                return(new IndexableFileInfo[0]);
            }

            IndexSearcher searcher;
            IndexReader   indexReader;

            try
            {
                FSDirectory indexDir = FSDirectory.GetDirectory(this.luceneIndexDir, false);
                indexReader = IndexReader.Open(indexDir);
                searcher    = new IndexSearcher(indexReader);
            }
            catch
            {
                // if the luceneIndexDir does not contain index files (yet), opening it
                // throws; that simply means there is nothing to find.
                return(new IndexableFileInfo[0]);
            }
            List <IndexableFileInfo> results = new List <IndexableFileInfo>();

            try
            {
                // -- weirdly enough, when the query is empty, an exception is thrown during the QueryParser.Parse
                //    this hack gets around that: a fresh GUID is a valid query (presumably matching nothing).
                string highlighterQueryText = queryForHighlighter;
                if (String.IsNullOrEmpty(highlighterQueryText))
                {
                    highlighterQueryText = Guid.NewGuid().ToString();
                }

                // parse the query against the "title" and "contents" fields.
                // NOTE(review): an earlier comment here asked for the StandardAnalyzer, but the
                // SimpleAnalyzer is what is actually used; confirm which one is intended.
                MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new string[] { "title", "contents" }, new SimpleAnalyzer());
                queryParser.SetDefaultOperator(QueryParser.AND_OPERATOR);

                Query query = queryParser.Parse(keywordQuery);

                // One analyzer serves both the highlight query parser and the per-hit token streams.
                hatWebPortalAnalyzer highlightAnalyzer = new hatWebPortalAnalyzer();

                QueryParser highlightQueryParser = new QueryParser("contents", highlightAnalyzer);

                Query highlighterQuery = highlightQueryParser.Parse(highlighterQueryText);

                query = searcher.Rewrite(query); // is this needed?? " Expert: called to re-write queries into primitive queries."

                // search
                Hits hits = searcher.Search(query, Sort.RELEVANCE);

                // create highlighter
                Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter("<strong>", "</strong>"), new QueryScorer(highlighterQuery));

                // -- go through hits and return results

                for (int i = 0; i < hits.Length(); i++)
                {
                    Document d                    = hits.Doc(i);
                    string   filename             = d.Get("filename");
                    string   plainText            = d.Get("contents");
                    string   title                = d.Get("title");
                    string   sectionName          = d.Get("SectionName");
                    string   filenameParams       = d.Get("filenameParams");
                    bool     contentIsPageSummary = Convert.ToBoolean(d.Get("contentIsPageSummary"));
                    double   score                = Convert.ToDouble(hits.Score(i));
                    DateTime lastModified         = DateTools.StringToDate(d.Get("LastModified"));

                    // Page summaries are returned as stored; everything else is reduced to
                    // (at most) the two best matching fragments, joined by "...".
                    string fragment = plainText;
                    if (!contentIsPageSummary)
                    {
                        TokenStream tokenStream = highlightAnalyzer.TokenStream("contents", new StringReader(plainText));
                        fragment = highlighter.GetBestFragments(tokenStream, plainText, 2, "...");
                    }

                    IndexableFileInfo newHit = new IndexableFileInfo(filename, filenameParams, title, fragment, sectionName, lastModified, contentIsPageSummary, score);
                    results.Add(newHit);
                } // for
            }
            finally
            {
                searcher.Close();
                indexReader.Close();
            }


            return(results.ToArray());
        } // doSearch
Example #5
0
        /// <summary>
        /// Finds indexed files whose "title" or "contents" are similar to the given title text.
        /// </summary>
        /// <param name="title">
        /// Text to find similar documents for. A null or blank title yields an empty result.
        /// </param>
        /// <param name="maxResultsToReturn">Upper bound on the number of files returned.</param>
        /// <returns>
        /// Up to <paramref name="maxResultsToReturn"/> files in the order the search returned them,
        /// or an empty array when the title is blank or the index cannot be opened. The returned
        /// objects carry the full stored contents (no highlighting).
        /// </returns>
        public IndexableFileInfo[] getRelatedFiles(string title, int maxResultsToReturn)
        {
            // Without a title no clauses would be added to the query, so there is nothing to match.
            if (title == null || title.Trim() == "")
            {
                return(new IndexableFileInfo[0]);
            }

            // http://blogs.intesoft.net/post/2008/04/NHibernateSearch-using-LuceneNET-Full-Text-Index-(Part-3).aspx
            Analyzer     analyzer = new StandardAnalyzer();
            BooleanQuery query    = new BooleanQuery();

            // Similarity on the title field counts for more than similarity on the contents.
            Query titleQ = Similarity.Net.SimilarityQueries.FormSimilarQuery(title, analyzer, "title", null);
            titleQ.SetBoost(LuceneIndexer.TitleFieldBoost);
            query.Add(titleQ, BooleanClause.Occur.SHOULD);

            Query contents = Similarity.Net.SimilarityQueries.FormSimilarQuery(title, analyzer, "contents", null);
            query.Add(contents, BooleanClause.Occur.SHOULD);

            // NOTE: nothing here keeps a page from being reported as similar to itself; a
            // MUST_NOT clause on the exact title was tried and is currently disabled:
            // query.Add(new TermQuery(new Term("title", title)), BooleanClause.Occur.MUST_NOT);

            // Alternative approach that is not used here (MoreLikeThis):
            //   IndexReader ir = ...
            //   IndexSearcher is = ...
            //   MoreLikeThis mlt = new MoreLikeThis(ir);
            //   Reader target = ...   // orig source of doc you want to find similarities to
            //   Query query = mlt.Like(target);
            //   Hits hits = is.Search(query);

            IndexSearcher searcher;

            try
            {
                FSDirectory indexDir = FSDirectory.GetDirectory(this.luceneIndexDir, false);
                searcher = new IndexSearcher(indexDir);
            }
            catch
            {
                // if the luceneIndexDir does not contain index files (yet), opening it
                // throws; that simply means there is nothing to find.
                return(new IndexableFileInfo[0]);
            }


            List <IndexableFileInfo> arrayList = new List <IndexableFileInfo>();

            try
            {
                // The search itself runs inside the try so the searcher is closed even when it throws.
                Hits hits = searcher.Search(query);

                int num = Math.Min(maxResultsToReturn, hits.Length());

                for (int i = 0; i < num; i++)
                {
                    Document d                    = hits.Doc(i);
                    string   filename             = d.Get("filename");
                    string   plainText            = d.Get("contents");
                    string   doctitle             = d.Get("title");
                    string   filenameParams       = d.Get("filenameParams");
                    bool     contentIsPageSummary = Convert.ToBoolean(d.Get("contentIsPageSummary"));
                    DateTime lastModified         = DateTools.StringToDate(d.Get("LastModified"));
                    string   fragment             = plainText;
                    string   sectionName          = d.Get("SectionName");

                    IndexableFileInfo newHit = new IndexableFileInfo(filename, filenameParams, doctitle, fragment, sectionName, lastModified, contentIsPageSummary);
                    arrayList.Add(newHit);
                } // for
            }
            finally
            {
                searcher.Close();
            }

            return(arrayList.ToArray());
        }